Fix kuromoji default stoptags (#26600)
Initialize the default stop tags in `KuromojiPartOfSpeechFilterFactory` when `stoptags` is not given in the config. Also add a test that checks that part-of-speech tokens are removed when using the `kuromoji_part_of_speech` filter.
This commit is contained in:
parent
7f74a620a1
commit
7184cf8b5b
|
@ -20,6 +20,7 @@
|
||||||
package org.elasticsearch.index.analysis;
|
package org.elasticsearch.index.analysis;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
|
||||||
import org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilter;
|
import org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilter;
|
||||||
import org.elasticsearch.common.settings.Settings;
|
import org.elasticsearch.common.settings.Settings;
|
||||||
import org.elasticsearch.env.Environment;
|
import org.elasticsearch.env.Environment;
|
||||||
|
@ -38,6 +39,8 @@ public class KuromojiPartOfSpeechFilterFactory extends AbstractTokenFilterFactor
|
||||||
List<String> wordList = Analysis.getWordList(env, settings, "stoptags");
|
List<String> wordList = Analysis.getWordList(env, settings, "stoptags");
|
||||||
if (wordList != null) {
|
if (wordList != null) {
|
||||||
stopTags.addAll(wordList);
|
stopTags.addAll(wordList);
|
||||||
|
} else {
|
||||||
|
stopTags.addAll(JapaneseAnalyzer.getDefaultStopTags());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -93,6 +93,21 @@ public class KuromojiAnalysisTests extends ESTestCase {
|
||||||
assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
|
assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testPartOfSpeechFilter() throws IOException {
|
||||||
|
TestAnalysis analysis = createTestAnalysis();
|
||||||
|
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("kuromoji_part_of_speech");
|
||||||
|
|
||||||
|
assertThat(tokenFilter, instanceOf(KuromojiPartOfSpeechFilterFactory.class));
|
||||||
|
|
||||||
|
String source = "寿司がおいしいね";
|
||||||
|
String[] expected_tokens = new String[]{"寿司", "おいしい"};
|
||||||
|
|
||||||
|
Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
|
||||||
|
tokenizer.setReader(new StringReader(source));
|
||||||
|
|
||||||
|
assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens);
|
||||||
|
}
|
||||||
|
|
||||||
public void testReadingFormFilterFactory() throws IOException {
|
public void testReadingFormFilterFactory() throws IOException {
|
||||||
TestAnalysis analysis = createTestAnalysis();
|
TestAnalysis analysis = createTestAnalysis();
|
||||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("kuromoji_rf");
|
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("kuromoji_rf");
|
||||||
|
@ -208,7 +223,7 @@ public class KuromojiAnalysisTests extends ESTestCase {
|
||||||
int i = 0;
|
int i = 0;
|
||||||
while (stream.incrementToken()) {
|
while (stream.incrementToken()) {
|
||||||
assertThat(expected.length, greaterThan(i));
|
assertThat(expected.length, greaterThan(i));
|
||||||
assertThat( "expected different term at index " + i, expected[i++], equalTo(termAttr.toString()));
|
assertThat("expected different term at index " + i, termAttr.toString(), equalTo(expected[i++]));
|
||||||
}
|
}
|
||||||
assertThat("not all tokens produced", i, equalTo(expected.length));
|
assertThat("not all tokens produced", i, equalTo(expected.length));
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue