diff --git a/plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/PhoneticTokenFilterFactory.java b/plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/PhoneticTokenFilterFactory.java index 52dabef7c5d..d02ac2ae2be 100644 --- a/plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/PhoneticTokenFilterFactory.java +++ b/plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/PhoneticTokenFilterFactory.java @@ -19,9 +19,6 @@ package org.elasticsearch.index.analysis; -import java.util.HashSet; -import java.util.List; - import org.apache.commons.codec.Encoder; import org.apache.commons.codec.language.Caverphone1; import org.apache.commons.codec.language.Caverphone2; @@ -45,6 +42,9 @@ import org.elasticsearch.index.analysis.phonetic.HaasePhonetik; import org.elasticsearch.index.analysis.phonetic.KoelnerPhonetik; import org.elasticsearch.index.analysis.phonetic.Nysiis; +import java.util.HashSet; +import java.util.List; + public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory { private final Encoder encoder; @@ -116,11 +116,11 @@ public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory { public TokenStream create(TokenStream tokenStream) { if (encoder == null) { if (ruletype != null && nametype != null) { - if (languageset != null) { - final LanguageSet languages = LanguageSet.from(new HashSet<>(languageset)); - return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true), languages); + LanguageSet langset = null; + if (languageset != null && languageset.size() > 0) { + langset = LanguageSet.from(new HashSet<>(languageset)); } - return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true)); + return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true), langset); } if (maxcodelength > 0) { return new DoubleMetaphoneFilter(tokenStream, maxcodelength, !replace); diff --git a/plugins/analysis-phonetic/src/test/java/org/elasticsearch/index/analysis/SimplePhoneticAnalysisTests.java b/plugins/analysis-phonetic/src/test/java/org/elasticsearch/index/analysis/SimplePhoneticAnalysisTests.java index 2f84d05563b..e3877faee31 100644 --- a/plugins/analysis-phonetic/src/test/java/org/elasticsearch/index/analysis/SimplePhoneticAnalysisTests.java +++ b/plugins/analysis-phonetic/src/test/java/org/elasticsearch/index/analysis/SimplePhoneticAnalysisTests.java @@ -19,6 +19,9 @@ package org.elasticsearch.index.analysis; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.elasticsearch.Version; import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.common.settings.Settings; @@ -26,19 +29,47 @@ import org.elasticsearch.index.Index; import org.elasticsearch.plugin.analysis.AnalysisPhoneticPlugin; import org.elasticsearch.test.ESTestCase; import org.hamcrest.MatcherAssert; +import org.junit.Before; import java.io.IOException; +import java.io.StringReader; import static org.hamcrest.Matchers.instanceOf; public class SimplePhoneticAnalysisTests extends ESTestCase { - public void testPhoneticTokenFilterFactory() throws IOException { + + private TestAnalysis analysis; + + @Before + public void setup() throws IOException { String yaml = "/org/elasticsearch/index/analysis/phonetic-1.yml"; Settings settings = Settings.builder().loadFromStream(yaml, getClass().getResourceAsStream(yaml), false) .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) .build(); - TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), settings, new AnalysisPhoneticPlugin()); + this.analysis = createTestAnalysis(new Index("test", "_na_"), settings, new AnalysisPhoneticPlugin()); + } + + public void testPhoneticTokenFilterFactory() throws IOException { TokenFilterFactory filterFactory = analysis.tokenFilter.get("phonetic"); MatcherAssert.assertThat(filterFactory, instanceOf(PhoneticTokenFilterFactory.class)); } + + public void testPhoneticTokenFilterBeiderMorseNoLanguage() throws IOException { + TokenFilterFactory filterFactory = analysis.tokenFilter.get("beidermorsefilter"); + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader("ABADIAS")); + String[] expected = new String[] { "abYdias", "abYdios", "abadia", "abadiaS", "abadias", "abadio", "abadioS", "abadios", "abodia", + "abodiaS", "abodias", "abodio", "abodioS", "abodios", "avadias", "avadios", "avodias", "avodios", "obadia", "obadiaS", + "obadias", "obadio", "obadioS", "obadios", "obodia", "obodiaS", "obodias", "obodioS" }; + BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected); + } + + public void testPhoneticTokenFilterBeiderMorseWithLanguage() throws IOException { + TokenFilterFactory filterFactory = analysis.tokenFilter.get("beidermorsefilterfrench"); + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader("Rimbault")); + String[] expected = new String[] { "rimbD", "rimbDlt", "rimba", "rimbalt", "rimbo", "rimbolt", "rimbu", "rimbult", "rmbD", "rmbDlt", + "rmba", "rmbalt", "rmbo", "rmbolt", "rmbu", "rmbult" }; + BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected); + } } diff --git a/plugins/analysis-phonetic/src/test/resources/org/elasticsearch/index/analysis/phonetic-1.yml b/plugins/analysis-phonetic/src/test/resources/org/elasticsearch/index/analysis/phonetic-1.yml index 6c0a0763881..1909c7ee063 100644 --- a/plugins/analysis-phonetic/src/test/resources/org/elasticsearch/index/analysis/phonetic-1.yml +++ b/plugins/analysis-phonetic/src/test/resources/org/elasticsearch/index/analysis/phonetic-1.yml @@ -19,6 +19,10 @@ index: beidermorsefilter: type: phonetic encoder: beidermorse + beidermorsefilterfrench: + type: phonetic + encoder: beidermorse + languageset : [ "french" ] koelnerphonetikfilter: type: phonetic encoder: koelnerphonetik