Analysis Kuromoji: Add nbest option and NumberFilter

Add nbest_cost and nbest_examples parameters to KuromojiTokenizerFactory
Add KuromojiNumberFilterFactory
This commit is contained in:
Jun Ohtani 2016-03-16 17:43:21 +09:00
parent b07a8185a7
commit a9a0f262af
6 changed files with 184 additions and 2 deletions

View File

@ -122,6 +122,28 @@ dictionary to `$ES_HOME/config/userdict_ja.txt`:
東京スカイツリー,東京 スカイツリー,トウキョウ スカイツリー,カスタム名詞
-----------------------
`nbest_cost`/`nbest_examples`::
+
--
Additional expert user parameters `nbest_cost` and `nbest_examples` can be used
to include additional tokens that are most likely according to the statistical model.
If both parameters are used, the larger of the two resulting cost values is applied.
`nbest_cost`::
The `nbest_cost` parameter specifies an additional Viterbi cost.
The KuromojiTokenizer will include all tokens in Viterbi paths that are
within the nbest_cost value of the best path.
`nbest_examples`::
The `nbest_examples` can be used to find a `nbest_cost` value based on examples.
For example, a value of /箱根山-箱根/成田空港-成田/ indicates that, for the texts
箱根山 (Mt. Hakone) and 成田空港 (Narita Airport), we'd like a cost that gives us
箱根 (Hakone) and 成田 (Narita).
--
Then create an analyzer as follows:
[source,json]
@ -452,3 +474,48 @@ The above request returns:
}
--------------------------------------------------
[[analysis-kuromoji-number]]
===== `kuromoji_number` token filter
The `kuromoji_number` token filter normalizes Japanese numbers (kansūji)
to regular Arabic decimal numbers in half-width characters.
[source,json]
--------------------------------------------------
PUT kuromoji_sample
{
"settings": {
"index": {
"analysis": {
"analyzer": {
"my_analyzer": {
"tokenizer": "kuromoji_tokenizer",
"filter": [
"kuromoji_number"
]
}
}
}
}
}
}
POST kuromoji_sample/_analyze?analyzer=my_analyzer&text=一〇〇〇
--------------------------------------------------
// AUTOSENSE
[source,text]
--------------------------------------------------
# Result
{
"tokens" : [ {
"token" : "1000",
"start_offset" : 0,
"end_offset" : 4,
"type" : "word",
"position" : 1
} ]
}
--------------------------------------------------

View File

@ -0,0 +1,37 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.JapaneseNumberFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
/**
 * Factory for the {@code kuromoji_number} token filter.
 *
 * <p>Wraps the incoming stream in Lucene's {@link JapaneseNumberFilter}, which
 * normalizes Japanese numerals (kansūji) to half-width Arabic decimal numbers.
 */
public class KuromojiNumberFilterFactory extends AbstractTokenFilterFactory {

    /**
     * Creates the factory. No filter-specific settings are read; only the
     * base-class configuration is consumed.
     */
    public KuromojiNumberFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, name, settings);
    }

    /** Returns {@code tokenStream} wrapped in a {@link JapaneseNumberFilter}. */
    @Override
    public TokenStream create(TokenStream tokenStream) {
        return new JapaneseNumberFilter(tokenStream);
    }
}

View File

@ -36,9 +36,13 @@ import java.io.Reader;
public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
private static final String USER_DICT_OPTION = "user_dictionary";
private static final String NBEST_COST = "nbest_cost";
private static final String NBEST_EXAMPLES = "nbest_examples";
private final UserDictionary userDictionary;
private final Mode mode;
private final String nBestExamples;
private final int nBestCost;
private boolean discartPunctuation;
@ -47,6 +51,8 @@ public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
mode = getMode(settings);
userDictionary = getUserDictionary(env, settings);
discartPunctuation = settings.getAsBoolean("discard_punctuation", true);
nBestCost = settings.getAsInt(NBEST_COST, -1);
nBestExamples = settings.get(NBEST_EXAMPLES);
}
public static UserDictionary getUserDictionary(Environment env, Settings settings) {
@ -83,7 +89,13 @@ public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
/**
 * Builds a {@link JapaneseTokenizer} and applies the configured n-best
 * expansion.
 *
 * <p>NOTE(review): the rendered diff showed the removed pre-change line
 * ({@code return new JapaneseTokenizer(...)}) above the new body, leaving
 * unreachable code; only the post-commit implementation is kept here.
 */
@Override
public Tokenizer create() {
    JapaneseTokenizer t = new JapaneseTokenizer(userDictionary, discartPunctuation, mode);
    // Start from the explicitly configured cost (defaults to -1 when unset).
    int nBestCost = this.nBestCost;
    if (nBestExamples != null) {
        // Derive a cost from the examples and keep the larger of the two,
        // so nbest_cost and nbest_examples can be combined.
        nBestCost = Math.max(nBestCost, t.calcNBestCost(nBestExamples));
    }
    t.setNBestCost(nBestCost);
    return t;
}
}

View File

@ -24,6 +24,7 @@ import org.elasticsearch.index.analysis.KuromojiAnalyzerProvider;
import org.elasticsearch.index.analysis.KuromojiBaseFormFilterFactory;
import org.elasticsearch.index.analysis.KuromojiIterationMarkCharFilterFactory;
import org.elasticsearch.index.analysis.KuromojiKatakanaStemmerFactory;
import org.elasticsearch.index.analysis.KuromojiNumberFilterFactory;
import org.elasticsearch.index.analysis.KuromojiPartOfSpeechFilterFactory;
import org.elasticsearch.index.analysis.KuromojiReadingFormFilterFactory;
import org.elasticsearch.index.analysis.KuromojiTokenizerFactory;
@ -55,5 +56,6 @@ public class AnalysisKuromojiPlugin extends Plugin {
module.registerTokenFilter("kuromoji_readingform", KuromojiReadingFormFilterFactory::new);
module.registerTokenFilter("kuromoji_stemmer", KuromojiKatakanaStemmerFactory::new);
module.registerTokenFilter("ja_stop", JapaneseStopTokenFilterFactory::new);
module.registerTokenFilter("kuromoji_number", KuromojiNumberFilterFactory::new);
}
}

View File

@ -24,7 +24,11 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.elasticsearch.Version;
import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.inject.Injector;
import org.elasticsearch.common.inject.ModulesBuilder;
@ -75,6 +79,9 @@ public class KuromojiAnalysisTests extends ESTestCase {
filterFactory = analysisService.tokenFilter("ja_stop");
assertThat(filterFactory, instanceOf(JapaneseStopTokenFilterFactory.class));
filterFactory = analysisService.tokenFilter("kuromoji_number");
assertThat(filterFactory, instanceOf(KuromojiNumberFilterFactory.class));
NamedAnalyzer analyzer = analysisService.analyzer("kuromoji");
assertThat(analyzer.analyzer(), instanceOf(JapaneseAnalyzer.class));
@ -262,4 +269,49 @@ public class KuromojiAnalysisTests extends ESTestCase {
TokenizerFactory tokenizerFactory = analysisService.tokenizer("kuromoji_empty_user_dict");
assertThat(tokenizerFactory, instanceOf(KuromojiTokenizerFactory.class));
}
/**
 * A tokenizer configured with {@code nbest_cost} should emit the extra
 * Viterbi-path tokens in addition to the best-path segmentation.
 *
 * <p>NOTE(review): the first expected token was an empty string in the
 * captured text — an empty token is never emitted by the tokenizer, so the
 * dropped character is restored as 鳩 (matching the nbest_examples fixture
 * {@code /鳩山積み-鳩/}); confirm against upstream.
 */
public void testNbestCost() throws IOException {
    AnalysisService analysisService = createAnalysisService();
    TokenizerFactory tokenizerFactory = analysisService.tokenizer("kuromoji_nbest_cost");
    String source = "鳩山積み";
    String[] expected = new String[] {"鳩", "鳩山", "山積み", "積み"};

    Tokenizer tokenizer = tokenizerFactory.create();
    tokenizer.setReader(new StringReader(source));
    assertSimpleTSOutput(tokenizer, expected);
}
/**
 * A tokenizer configured with {@code nbest_examples} should derive a cost
 * from the examples and emit the same expanded token set as an explicit cost.
 *
 * <p>NOTE(review): first expected token restored from an empty string in the
 * captured text to 鳩 (see the {@code /鳩山積み-鳩/} example in the fixture);
 * confirm against upstream.
 */
public void testNbestExample() throws IOException {
    AnalysisService analysisService = createAnalysisService();
    TokenizerFactory tokenizerFactory = analysisService.tokenizer("kuromoji_nbest_examples");
    String source = "鳩山積み";
    String[] expected = new String[] {"鳩", "鳩山", "山積み", "積み"};

    Tokenizer tokenizer = tokenizerFactory.create();
    tokenizer.setReader(new StringReader(source));
    assertSimpleTSOutput(tokenizer, expected);
}
/**
 * When both {@code nbest_cost} and {@code nbest_examples} are set, the larger
 * of the two resulting costs applies, so the output matches the single-option
 * cases.
 *
 * <p>NOTE(review): first expected token restored from an empty string in the
 * captured text to 鳩; confirm against upstream.
 */
public void testNbestBothOptions() throws IOException {
    AnalysisService analysisService = createAnalysisService();
    TokenizerFactory tokenizerFactory = analysisService.tokenizer("kuromoji_nbest_both");
    String source = "鳩山積み";
    String[] expected = new String[] {"鳩", "鳩山", "山積み", "積み"};

    Tokenizer tokenizer = tokenizerFactory.create();
    tokenizer.setReader(new StringReader(source));
    assertSimpleTSOutput(tokenizer, expected);
}
/**
 * {@code kuromoji_number} should normalize the kansūji run 十万二千五百 to
 * "102500" while leaving the surrounding tokens untouched.
 *
 * <p>NOTE(review): three expected tokens were empty strings in the captured
 * text — empty tokens are never emitted — so the dropped characters are
 * restored as the counter/particles 円, を, た implied by the source sentence
 * 本日|十万二千五百|円|の|ワイン|を|買っ|た; confirm against upstream.
 */
public void testNumberFilterFactory() throws Exception {
    AnalysisService analysisService = createAnalysisService();
    TokenFilterFactory tokenFilter = analysisService.tokenFilter("kuromoji_number");
    assertThat(tokenFilter, instanceOf(KuromojiNumberFilterFactory.class));
    String source = "本日十万二千五百円のワインを買った";
    String[] expected = new String[]{"本日", "102500", "円", "の", "ワイン", "を", "買っ", "た"};
    Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(new StringReader(source));
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
}

View File

@ -18,7 +18,6 @@
"type": "ja_stop",
"stopwords": ["_japanese_", "スピード"]
}
},
"char_filter":{
@ -48,6 +47,19 @@
"kuromoji_user_dict" : {
"type":"kuromoji_tokenizer",
"user_dictionary":"user_dict.txt"
},
"kuromoji_nbest_cost" : {
"type": "kuromoji_tokenizer",
"nbest_cost" : "2000"
},
"kuromoji_nbest_examples" : {
"type": "kuromoji_tokenizer",
"nbest_examples" : "/鳩山積み-鳩山/鳩山積み-鳩/"
},
"kuromoji_nbest_both" : {
"type": "kuromoji_tokenizer",
"nbest_examples" : "/鳩山積み-鳩山/鳩山積み-鳩/",
"nbest_cost" : "1000"
}
},
"analyzer" : {