Fix beidermorse phonetic token filter for unspecified `languageset` (#27112)
Currently, when a BeiderMorseFilter is created without a `languageset` setting, the filter does not guess the language, even though language guessing should be the default behaviour. This change fixes that and adds simple tests for the cases with and without a provided `languageset` setting. Closes #26771
This commit is contained in:
parent
6625ecfff4
commit
9253ea8aec
|
@ -19,9 +19,6 @@
|
|||
|
||||
package org.elasticsearch.index.analysis;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.codec.Encoder;
|
||||
import org.apache.commons.codec.language.Caverphone1;
|
||||
import org.apache.commons.codec.language.Caverphone2;
|
||||
|
@ -45,6 +42,9 @@ import org.elasticsearch.index.analysis.phonetic.HaasePhonetik;
|
|||
import org.elasticsearch.index.analysis.phonetic.KoelnerPhonetik;
|
||||
import org.elasticsearch.index.analysis.phonetic.Nysiis;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
|
||||
public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||
|
||||
private final Encoder encoder;
|
||||
|
@ -116,11 +116,11 @@ public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory {
|
|||
public TokenStream create(TokenStream tokenStream) {
|
||||
if (encoder == null) {
|
||||
if (ruletype != null && nametype != null) {
|
||||
if (languageset != null) {
|
||||
final LanguageSet languages = LanguageSet.from(new HashSet<>(languageset));
|
||||
return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true), languages);
|
||||
LanguageSet langset = null;
|
||||
if (languageset != null && languageset.size() > 0) {
|
||||
langset = LanguageSet.from(new HashSet<>(languageset));
|
||||
}
|
||||
return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true));
|
||||
return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true), langset);
|
||||
}
|
||||
if (maxcodelength > 0) {
|
||||
return new DoubleMetaphoneFilter(tokenStream, maxcodelength, !replace);
|
||||
|
|
|
@ -19,6 +19,9 @@
|
|||
|
||||
package org.elasticsearch.index.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.cluster.metadata.IndexMetaData;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
|
@ -26,19 +29,47 @@ import org.elasticsearch.index.Index;
|
|||
import org.elasticsearch.plugin.analysis.AnalysisPhoneticPlugin;
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
import org.hamcrest.MatcherAssert;
|
||||
import org.junit.Before;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
||||
import static org.hamcrest.Matchers.instanceOf;
|
||||
|
||||
public class SimplePhoneticAnalysisTests extends ESTestCase {
|
||||
public void testPhoneticTokenFilterFactory() throws IOException {
|
||||
|
||||
private TestAnalysis analysis;
|
||||
|
||||
@Before
|
||||
public void setup() throws IOException {
|
||||
String yaml = "/org/elasticsearch/index/analysis/phonetic-1.yml";
|
||||
Settings settings = Settings.builder().loadFromStream(yaml, getClass().getResourceAsStream(yaml), false)
|
||||
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
|
||||
.build();
|
||||
TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), settings, new AnalysisPhoneticPlugin());
|
||||
this.analysis = createTestAnalysis(new Index("test", "_na_"), settings, new AnalysisPhoneticPlugin());
|
||||
}
|
||||
|
||||
/**
 * Checks that the token filter registered under the name "phonetic" in the
 * shared test analysis is a {@link PhoneticTokenFilterFactory}.
 */
public void testPhoneticTokenFilterFactory() throws IOException {
|
||||
// Look the factory up by its configured name in the analysis built for this test class.
TokenFilterFactory filterFactory = analysis.tokenFilter.get("phonetic");
|
||||
MatcherAssert.assertThat(filterFactory, instanceOf(PhoneticTokenFilterFactory.class));
|
||||
}
|
||||
|
||||
/**
 * Runs the "beidermorsefilter" token filter over the single token "ABADIAS"
 * and asserts that the produced token stream matches the expected list of
 * phonetic encodings exactly.
 *
 * NOTE(review): the "beidermorsefilter" entry visible in the YAML hunk of this
 * change declares no languageset, so this is the "no language provided" case;
 * confirm against the full phonetic-1.yml.
 */
public void testPhoneticTokenFilterBeiderMorseNoLanguage() throws IOException {
|
||||
TokenFilterFactory filterFactory = analysis.tokenFilter.get("beidermorsefilter");
|
||||
// Whitespace tokenization keeps the input as one token, which the filter then encodes.
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||
tokenizer.setReader(new StringReader("ABADIAS"));
|
||||
// Expected encodings emitted for "ABADIAS", in the order the filter is expected to produce them.
String[] expected = new String[] { "abYdias", "abYdios", "abadia", "abadiaS", "abadias", "abadio", "abadioS", "abadios", "abodia",
|
||||
"abodiaS", "abodias", "abodio", "abodioS", "abodios", "avadias", "avadios", "avodias", "avodios", "obadia", "obadiaS",
|
||||
"obadias", "obadio", "obadioS", "obadios", "obodia", "obodiaS", "obodias", "obodioS" };
|
||||
BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected);
|
||||
}
|
||||
|
||||
/**
 * Runs the "beidermorsefilterfrench" token filter over the single token
 * "Rimbault" and asserts that the produced token stream matches the expected
 * list of phonetic encodings exactly.
 *
 * NOTE(review): the "beidermorsefilterfrench" entry visible in the YAML hunk of
 * this change sets languageset to [ "french" ], so this is the "language
 * provided" case; confirm against the full phonetic-1.yml.
 */
public void testPhoneticTokenFilterBeiderMorseWithLanguage() throws IOException {
|
||||
TokenFilterFactory filterFactory = analysis.tokenFilter.get("beidermorsefilterfrench");
|
||||
// Whitespace tokenization keeps the input as one token, which the filter then encodes.
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||
tokenizer.setReader(new StringReader("Rimbault"));
|
||||
// Expected encodings emitted for "Rimbault", in the order the filter is expected to produce them.
String[] expected = new String[] { "rimbD", "rimbDlt", "rimba", "rimbalt", "rimbo", "rimbolt", "rimbu", "rimbult", "rmbD", "rmbDlt",
|
||||
"rmba", "rmbalt", "rmbo", "rmbolt", "rmbu", "rmbult" };
|
||||
BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,6 +19,10 @@ index:
|
|||
beidermorsefilter:
|
||||
type: phonetic
|
||||
encoder: beidermorse
|
||||
beidermorsefilterfrench:
|
||||
type: phonetic
|
||||
encoder: beidermorse
|
||||
languageset : [ "french" ]
|
||||
koelnerphonetikfilter:
|
||||
type: phonetic
|
||||
encoder: koelnerphonetik
|
||||
|
|
Loading…
Reference in New Issue