Fix kuromoji default stoptags (#26600)

Initialize the default stop-tags in `KuromojiPartOfSpeechFilterFactory` if the `stoptags` are not given in the config. Also adding a test which checks that part-of-speech tokens are removed when using the kuromoji_part_of_speech filter.
2017-09-15 12:25:09 +02:00 · 2017-09-15 12:25:09 +02:00 · 7184cf8b5b
parent 7f74a620a1
commit 7184cf8b5b
2 changed files with 19 additions and 1 deletions
--- a/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiPartOfSpeechFilterFactory.java
+++ b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiPartOfSpeechFilterFactory.java
@ -20,6 +20,7 @@
 package org.elasticsearch.index.analysis;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
 import org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilter;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
@ -38,6 +39,8 @@ public class KuromojiPartOfSpeechFilterFactory extends AbstractTokenFilterFactor
        List<String> wordList = Analysis.getWordList(env, settings, "stoptags");
        if (wordList != null) {
            stopTags.addAll(wordList);
        } else {
            stopTags.addAll(JapaneseAnalyzer.getDefaultStopTags());
        }
    }
--- a/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java
+++ b/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java
@ -93,6 +93,21 @@ public class KuromojiAnalysisTests extends ESTestCase {
        assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
    }
    public void testPartOfSpeechFilter() throws IOException {
        TestAnalysis analysis = createTestAnalysis();
        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("kuromoji_part_of_speech");
        assertThat(tokenFilter, instanceOf(KuromojiPartOfSpeechFilterFactory.class));
        String source = "寿司がおいしいね";
        String[] expected_tokens = new String[]{"寿司", "おいしい"};
        Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
        tokenizer.setReader(new StringReader(source));
        assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens);
    }
    public void testReadingFormFilterFactory() throws IOException {
        TestAnalysis analysis = createTestAnalysis();
        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("kuromoji_rf");
@ -208,7 +223,7 @@ public class KuromojiAnalysisTests extends ESTestCase {
        int i = 0;
        while (stream.incrementToken()) {
            assertThat(expected.length, greaterThan(i));
-            assertThat( "expected different term at index " + i, expected[i++], equalTo(termAttr.toString()));
+            assertThat("expected different term at index " + i, termAttr.toString(), equalTo(expected[i++]));
        }
        assertThat("not all tokens produced", i, equalTo(expected.length));
    }