Fix kuromoji default stoptags (#26600)

Initialize the default stop tags in `KuromojiPartOfSpeechFilterFactory` if
`stoptags` is not given in the config. Also add a test that checks that
part-of-speech tokens are removed when using the `kuromoji_part_of_speech`
filter.
This commit is contained in:
Claudio Bley 2017-09-15 12:25:09 +02:00 committed by Christoph Büscher
parent 7f74a620a1
commit 7184cf8b5b
2 changed files with 19 additions and 1 deletions

View File

@ -20,6 +20,7 @@
package org.elasticsearch.index.analysis; package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilter; import org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilter;
import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment; import org.elasticsearch.env.Environment;
@ -38,6 +39,8 @@ public class KuromojiPartOfSpeechFilterFactory extends AbstractTokenFilterFactor
List<String> wordList = Analysis.getWordList(env, settings, "stoptags"); List<String> wordList = Analysis.getWordList(env, settings, "stoptags");
if (wordList != null) { if (wordList != null) {
stopTags.addAll(wordList); stopTags.addAll(wordList);
} else {
stopTags.addAll(JapaneseAnalyzer.getDefaultStopTags());
} }
} }

View File

@ -93,6 +93,21 @@ public class KuromojiAnalysisTests extends ESTestCase {
assertSimpleTSOutput(tokenFilter.create(tokenizer), expected); assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
} }
/**
 * Looks up the {@code kuromoji_part_of_speech} token filter from the test
 * analysis setup, verifies it is a {@link KuromojiPartOfSpeechFilterFactory},
 * and checks that running the sample sentence through a search-mode
 * {@link JapaneseTokenizer} followed by that filter yields exactly the
 * expected terms.
 *
 * @throws IOException if building the test analysis or consuming the token
 *                     stream fails
 */
public void testPartOfSpeechFilter() throws IOException {
    // Fixture: input sentence and the terms expected to survive the filter.
    final String input = "寿司がおいしいね";
    final String[] expectedTerms = {"寿司", "おいしい"};

    TestAnalysis testAnalysis = createTestAnalysis();
    TokenFilterFactory posFilter = testAnalysis.tokenFilter.get("kuromoji_part_of_speech");
    assertThat(posFilter, instanceOf(KuromojiPartOfSpeechFilterFactory.class));

    Tokenizer japaneseTokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    japaneseTokenizer.setReader(new StringReader(input));

    assertSimpleTSOutput(posFilter.create(japaneseTokenizer), expectedTerms);
}
public void testReadingFormFilterFactory() throws IOException { public void testReadingFormFilterFactory() throws IOException {
TestAnalysis analysis = createTestAnalysis(); TestAnalysis analysis = createTestAnalysis();
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("kuromoji_rf"); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("kuromoji_rf");
@ -208,7 +223,7 @@ public class KuromojiAnalysisTests extends ESTestCase {
int i = 0; int i = 0;
while (stream.incrementToken()) { while (stream.incrementToken()) {
assertThat(expected.length, greaterThan(i)); assertThat(expected.length, greaterThan(i));
assertThat( "expected different term at index " + i, expected[i++], equalTo(termAttr.toString())); assertThat("expected different term at index " + i, termAttr.toString(), equalTo(expected[i++]));
} }
assertThat("not all tokens produced", i, equalTo(expected.length)); assertThat("not all tokens produced", i, equalTo(expected.length));
} }