Add "ja_stop" filter

* can use a predefined "_japanese_" stop words * can not use other predefined stop words * upgrade to lucene 5 * add ja_stop to README Closes #45
2025-03-09 14:34:43 +00:00 · 2014-10-21 18:07:00 +09:00 · 2014-10-21 18:07:00 +09:00 · 0a0d6fd644
commit 0a0d6fd644
parent d0f629b0f5
5 changed files with 143 additions and 3 deletions
--- a/README.md
+++ b/README.md
@ -45,6 +45,7 @@ The plugin includes these analyzer and tokenizer, tokenfilter.
 | kuromoji_part_of_speech | tokenfilter |
 | kuromoji_readingform    | tokenfilter |
 | kuromoji_stemmer        | tokenfilter |
+| ja_stop                 | tokenfilter |


 Usage
@ -475,6 +476,50 @@ _Response :_
 ```


+## TokenFilter : ja_stop
+
+
+A token filter of type `ja_stop` that provides the predefined "_japanese_" stop words.
+*Note: It only provides "_japanese_". If you want to use other predefined stop words, use the `stop` token filter instead.*
+
+### example
+
+```sh
+curl -XPUT 'http://localhost:9200/kuromoji_sample/' -d'
+{
+    "settings": {
+        "index":{
+            "analysis":{
+                "analyzer" : {
+                    "analyzer_with_ja_stop" : {
+                        "tokenizer" : "kuromoji_tokenizer",
+                        "filter" : ["ja_stop"]
+                    }
+                },
+                "filter" : {
+                    "ja_stop" : {
+                        "type" : "ja_stop",
+                        "stopwords" : ["_japanese_", "ストップ"]
+                    }
+                }
+            }
+        }
+    }
+}
+'
+
+curl -XPOST 'http://localhost:9200/kuromoji_sample/_analyze?analyzer=analyzer_with_ja_stop&pretty' -d 'ストップは消える'
+{
+  "tokens" : [ {
+    "token" : "消える",
+    "start_offset" : 5,
+    "end_offset" : 8,
+    "type" : "word",
+    "position" : 3
+  } ]
+}
+```
+
 License
 -------

--- a/src/main/java/org/elasticsearch/index/analysis/JapaneseStopTokenFilterFactory.java
+++ b/src/main/java/org/elasticsearch/index/analysis/JapaneseStopTokenFilterFactory.java
@ -0,0 +1,76 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.search.suggest.analyzing.SuggestStopFilter;
+import org.elasticsearch.common.collect.ImmutableMap;
+import org.elasticsearch.common.collect.MapBuilder;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.inject.assistedinject.Assisted;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.settings.IndexSettings;
+
+import java.util.Set;
+
+public class JapaneseStopTokenFilterFactory extends AbstractTokenFilterFactory{
+    // Token filter factory that builds a stop-word filter from the "stopwords", "ignore_case" and "remove_trailing" settings.
+    // Stop words parsed from the "stopwords" setting in the constructor.
+    private final CharArraySet stopWords;
+    // Value of the "ignore_case" setting (default false); forwarded to Analysis.parseWords.
+    private final boolean ignoreCase;
+    // Value of the "remove_trailing" setting (default true); selects the filter implementation in create().
+    private final boolean removeTrailing;
+    // Reads the filter settings and resolves the stop-word set once, at construction time.
+    @Inject
+    public JapaneseStopTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) {
+        super(index, indexSettings, name, settings);
+        this.ignoreCase = settings.getAsBoolean("ignore_case", false);
+        this.removeTrailing = settings.getAsBoolean("remove_trailing", true);
+        ImmutableMap<String, Set<?>> namedStopWords = MapBuilder.<String, Set<?>>newMapBuilder()
+            .put("_japanese_", JapaneseAnalyzer.getDefaultStopSet())
+            .immutableMap(); // "_japanese_" is the only named stop-word set offered to the parser
+        this.stopWords = Analysis.parseWords(env, settings, "stopwords", JapaneseAnalyzer.getDefaultStopSet(), namedStopWords, ignoreCase); // NOTE(review): 4th argument is presumably the fallback used when "stopwords" is not configured - confirm against Analysis.parseWords
+    }
+    // Wraps the given stream in a StopFilter when removeTrailing is true, otherwise in a SuggestStopFilter.
+    @Override
+    public TokenStream create(TokenStream tokenStream) {
+        if (removeTrailing) {
+            return new StopFilter(tokenStream, stopWords);
+        } else {
+            return new SuggestStopFilter(tokenStream, stopWords); // NOTE(review): presumably keeps a trailing stop word for suggest-style input - confirm against Lucene docs
+        }
+    }
+    // Returns the stop-word set built in the constructor.
+    public Set<?> stopWords() {
+        return stopWords;
+    }
+    // Returns the value read from the "ignore_case" setting.
+    public boolean ignoreCase() {
+        return ignoreCase;
+    }
+
+}
--- a/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java
+++ b/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java
@ -55,5 +55,6 @@ public class AnalysisKuromojiPlugin extends AbstractPlugin {
        module.addTokenFilter("kuromoji_part_of_speech", KuromojiPartOfSpeechFilterFactory.class);
        module.addTokenFilter("kuromoji_readingform", KuromojiReadingFormFilterFactory.class);
        module.addTokenFilter("kuromoji_stemmer", KuromojiKatakanaStemmerFactory.class);
+        module.addTokenFilter("ja_stop", JapaneseStopTokenFilterFactory.class);
    }
 }
--- a/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java
+++ b/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java
@ -71,6 +71,9 @@ public class KuromojiAnalysisTests extends ElasticsearchTestCase {
        filterFactory = analysisService.tokenFilter("kuromoji_stemmer");
        assertThat(filterFactory, instanceOf(KuromojiKatakanaStemmerFactory.class));

+        filterFactory = analysisService.tokenFilter("ja_stop");
+        assertThat(filterFactory, instanceOf(JapaneseStopTokenFilterFactory.class));
+
        NamedAnalyzer analyzer = analysisService.analyzer("kuromoji");
        assertThat(analyzer.analyzer(), instanceOf(JapaneseAnalyzer.class));

@ -80,6 +83,7 @@ public class KuromojiAnalysisTests extends ElasticsearchTestCase {

        CharFilterFactory  charFilterFactory = analysisService.charFilter("kuromoji_iteration_mark");
        assertThat(charFilterFactory, instanceOf(KuromojiIterationMarkCharFilterFactory.class));
+
    }

    @Test
@ -172,10 +176,21 @@ public class KuromojiAnalysisTests extends ElasticsearchTestCase {
        expected = "ところどころ、ジジが、時時、馬鹿馬鹿しい";

        assertCharFilterEquals(charFilterFactory.create(new StringReader(source)), expected);
-
-
    }

+    @Test // Verifies that the configured "ja_stop" filter removes stop words from a tokenized Japanese sentence.
+    public void testJapaneseStopFilterFactory() throws IOException {
+        AnalysisService analysisService = createAnalysisService();
+        TokenFilterFactory tokenFilter = analysisService.tokenFilter("ja_stop"); // look the filter up by its configured name
+        assertThat(tokenFilter, instanceOf(JapaneseStopTokenFilterFactory.class));
+        String source = "私は制限スピードを超える。";
+        String[] expected = new String[]{"私", "制限", "超える"}; // "スピード" is listed explicitly in the test fixture's stopwords; "は" and "を" are presumably removed via "_japanese_" - confirm against the Lucene stop set
+        Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH); // NOTE(review): null/true are presumably "no user dictionary" and "discard punctuation" - confirm against JapaneseTokenizer
+        tokenizer.setReader(new StringReader(source));
+        assertSimpleTSOutput(tokenFilter.create(tokenizer), expected); // compare the filtered token stream with the expected terms
+    }
+
+
    public AnalysisService createAnalysisService() {
        Settings settings = ImmutableSettings.settingsBuilder()
                .loadFromClasspath("org/elasticsearch/index/analysis/kuromoji_analysis.json")
--- a/src/test/java/org/elasticsearch/index/analysis/kuromoji_analysis.json
+++ b/src/test/java/org/elasticsearch/index/analysis/kuromoji_analysis.json
@ -13,9 +13,12 @@
                "kuromoji_ks" : {
                    "type": "kuromoji_stemmer",
                    "minimum_length" : 6
+                },
+                "ja_stop" : {
+                    "type": "ja_stop",
+                    "stopwords": ["_japanese_", "スピード"]
                }
                
-                
            },

            "char_filter":{