diff --git a/docs/plugins/analysis-kuromoji.asciidoc b/docs/plugins/analysis-kuromoji.asciidoc index 8d1d6ac2d98..ff42287d03f 100644 --- a/docs/plugins/analysis-kuromoji.asciidoc +++ b/docs/plugins/analysis-kuromoji.asciidoc @@ -122,6 +122,28 @@ dictionary to `$ES_HOME/config/userdict_ja.txt`: 東京スカイツリー,東京 スカイツリー,トウキョウ スカイツリー,カスタム名詞 ----------------------- +`nbest_cost`/`nbest_examples`:: ++ +-- +Additional expert user parameters `nbest_cost` and `nbest_examples` can be used +to include additional tokens that are most likely according to the statistical model. +If both parameters are used, the larger of the two costs is applied. + +`nbest_cost`:: + + The `nbest_cost` parameter specifies an additional Viterbi cost. + The KuromojiTokenizer will include all tokens in Viterbi paths that are + within the nbest_cost value of the best path. + +`nbest_examples`:: + + The `nbest_examples` can be used to find a `nbest_cost` value based on examples. + For example, a value of /箱根山-箱根/成田空港-成田/ indicates that in the texts, + 箱根山 (Mt. Hakone) and 成田空港 (Narita Airport) we'd like a cost that gives us + 箱根 (Hakone) and 成田 (Narita). +-- + + Then create an analyzer as follows: [source,json] @@ -452,3 +474,48 @@ The above request returns: } -------------------------------------------------- +[[analysis-kuromoji-number]] +===== `kuromoji_number` token filter + +The `kuromoji_number` token filter normalizes Japanese numbers (kansūji) +to regular Arabic decimal numbers in half-width characters. 
+ +[source,json] +-------------------------------------------------- +PUT kuromoji_sample +{ + "settings": { + "index": { + "analysis": { + "analyzer": { + "my_analyzer": { + "tokenizer": "kuromoji_tokenizer", + "filter": [ + "kuromoji_number" + ] + } + } + } + } + } +} + +POST kuromoji_sample/_analyze?analyzer=my_analyzer&text=一〇〇〇 + +-------------------------------------------------- +// AUTOSENSE + +[source,text] +-------------------------------------------------- +# Result +{ + "tokens" : [ { + "token" : "1000", + "start_offset" : 0, + "end_offset" : 4, + "type" : "word", + "position" : 1 + } ] +} +-------------------------------------------------- + diff --git a/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiNumberFilterFactory.java b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiNumberFilterFactory.java new file mode 100644 index 00000000000..cb6b478957a --- /dev/null +++ b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiNumberFilterFactory.java @@ -0,0 +1,37 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.ja.JapaneseNumberFilter; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.IndexSettings; + +public class KuromojiNumberFilterFactory extends AbstractTokenFilterFactory { + + public KuromojiNumberFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + super(indexSettings, name, settings); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new JapaneseNumberFilter(tokenStream); + } +} diff --git a/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiTokenizerFactory.java b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiTokenizerFactory.java index 87e08c757b4..9e41621525a 100644 --- a/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiTokenizerFactory.java +++ b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiTokenizerFactory.java @@ -36,9 +36,13 @@ import java.io.Reader; public class KuromojiTokenizerFactory extends AbstractTokenizerFactory { private static final String USER_DICT_OPTION = "user_dictionary"; + private static final String NBEST_COST = "nbest_cost"; + private static final String NBEST_EXAMPLES = "nbest_examples"; private final UserDictionary userDictionary; private final Mode mode; + private final String nBestExamples; + private final int nBestCost; private boolean discartPunctuation; @@ -47,6 +51,8 @@ public class KuromojiTokenizerFactory extends AbstractTokenizerFactory { mode = getMode(settings); userDictionary = getUserDictionary(env, settings); discartPunctuation = settings.getAsBoolean("discard_punctuation", true); + nBestCost = settings.getAsInt(NBEST_COST, -1); + nBestExamples = settings.get(NBEST_EXAMPLES); } public static UserDictionary 
getUserDictionary(Environment env, Settings settings) { @@ -83,7 +89,13 @@ public class KuromojiTokenizerFactory extends AbstractTokenizerFactory { @Override public Tokenizer create() { - return new JapaneseTokenizer(userDictionary, discartPunctuation, mode); + JapaneseTokenizer t = new JapaneseTokenizer(userDictionary, discartPunctuation, mode); + int nBestCost = this.nBestCost; + if (nBestExamples != null) { + nBestCost = Math.max(nBestCost, t.calcNBestCost(nBestExamples)); + } + t.setNBestCost(nBestCost); + return t; } } diff --git a/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java index 6c0a15f2e39..4208b1be504 100644 --- a/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java +++ b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java @@ -24,6 +24,7 @@ import org.elasticsearch.index.analysis.KuromojiAnalyzerProvider; import org.elasticsearch.index.analysis.KuromojiBaseFormFilterFactory; import org.elasticsearch.index.analysis.KuromojiIterationMarkCharFilterFactory; import org.elasticsearch.index.analysis.KuromojiKatakanaStemmerFactory; +import org.elasticsearch.index.analysis.KuromojiNumberFilterFactory; import org.elasticsearch.index.analysis.KuromojiPartOfSpeechFilterFactory; import org.elasticsearch.index.analysis.KuromojiReadingFormFilterFactory; import org.elasticsearch.index.analysis.KuromojiTokenizerFactory; @@ -55,5 +56,6 @@ public class AnalysisKuromojiPlugin extends Plugin { module.registerTokenFilter("kuromoji_readingform", KuromojiReadingFormFilterFactory::new); module.registerTokenFilter("kuromoji_stemmer", KuromojiKatakanaStemmerFactory::new); module.registerTokenFilter("ja_stop", JapaneseStopTokenFilterFactory::new); + 
module.registerTokenFilter("kuromoji_number", KuromojiNumberFilterFactory::new); } } diff --git a/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java b/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java index b81de20d73d..04d8d64cc75 100644 --- a/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java +++ b/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java @@ -24,7 +24,11 @@ import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.ja.JapaneseAnalyzer; import org.apache.lucene.analysis.ja.JapaneseTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.elasticsearch.Version; +import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse; import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.common.inject.Injector; import org.elasticsearch.common.inject.ModulesBuilder; @@ -75,6 +79,9 @@ public class KuromojiAnalysisTests extends ESTestCase { filterFactory = analysisService.tokenFilter("ja_stop"); assertThat(filterFactory, instanceOf(JapaneseStopTokenFilterFactory.class)); + filterFactory = analysisService.tokenFilter("kuromoji_number"); + assertThat(filterFactory, instanceOf(KuromojiNumberFilterFactory.class)); + NamedAnalyzer analyzer = analysisService.analyzer("kuromoji"); assertThat(analyzer.analyzer(), instanceOf(JapaneseAnalyzer.class)); @@ -262,4 +269,49 @@ public class KuromojiAnalysisTests extends ESTestCase { TokenizerFactory tokenizerFactory = analysisService.tokenizer("kuromoji_empty_user_dict"); assertThat(tokenizerFactory, instanceOf(KuromojiTokenizerFactory.class)); 
} + + public void testNbestCost() throws IOException { + AnalysisService analysisService = createAnalysisService(); + TokenizerFactory tokenizerFactory = analysisService.tokenizer("kuromoji_nbest_cost"); + String source = "鳩山積み"; + String[] expected = new String[] {"鳩", "鳩山", "山積み", "積み"}; + + Tokenizer tokenizer = tokenizerFactory.create(); + tokenizer.setReader(new StringReader(source)); + assertSimpleTSOutput(tokenizer, expected); + } + + public void testNbestExample() throws IOException { + AnalysisService analysisService = createAnalysisService(); + TokenizerFactory tokenizerFactory = analysisService.tokenizer("kuromoji_nbest_examples"); + String source = "鳩山積み"; + String[] expected = new String[] {"鳩", "鳩山", "山積み", "積み"}; + + Tokenizer tokenizer = tokenizerFactory.create(); + tokenizer.setReader(new StringReader(source)); + assertSimpleTSOutput(tokenizer, expected); + } + + public void testNbestBothOptions() throws IOException { + AnalysisService analysisService = createAnalysisService(); + TokenizerFactory tokenizerFactory = analysisService.tokenizer("kuromoji_nbest_both"); + String source = "鳩山積み"; + String[] expected = new String[] {"鳩", "鳩山", "山積み", "積み"}; + + Tokenizer tokenizer = tokenizerFactory.create(); + tokenizer.setReader(new StringReader(source)); + assertSimpleTSOutput(tokenizer, expected); + + } + + public void testNumberFilterFactory() throws Exception { + AnalysisService analysisService = createAnalysisService(); + TokenFilterFactory tokenFilter = analysisService.tokenFilter("kuromoji_number"); + assertThat(tokenFilter, instanceOf(KuromojiNumberFilterFactory.class)); + String source = "本日十万二千五百円のワインを買った"; + String[] expected = new String[]{"本日", "102500", "円", "の", "ワイン", "を", "買っ", "た"}; + Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH); + tokenizer.setReader(new StringReader(source)); + assertSimpleTSOutput(tokenFilter.create(tokenizer), expected); + } } diff --git 
a/plugins/analysis-kuromoji/src/test/resources/org/elasticsearch/index/analysis/kuromoji_analysis.json b/plugins/analysis-kuromoji/src/test/resources/org/elasticsearch/index/analysis/kuromoji_analysis.json index 58ed015b850..d0f94a2117b 100644 --- a/plugins/analysis-kuromoji/src/test/resources/org/elasticsearch/index/analysis/kuromoji_analysis.json +++ b/plugins/analysis-kuromoji/src/test/resources/org/elasticsearch/index/analysis/kuromoji_analysis.json @@ -18,7 +18,6 @@ "type": "ja_stop", "stopwords": ["_japanese_", "スピード"] } - }, "char_filter":{ @@ -48,6 +47,19 @@ "kuromoji_user_dict" : { "type":"kuromoji_tokenizer", "user_dictionary":"user_dict.txt" + }, + "kuromoji_nbest_cost" : { + "type": "kuromoji_tokenizer", + "nbest_cost" : "2000" + }, + "kuromoji_nbest_examples" : { + "type": "kuromoji_tokenizer", + "nbest_examples" : "/鳩山積み-鳩山/鳩山積み-鳩/" + }, + "kuromoji_nbest_both" : { + "type": "kuromoji_tokenizer", + "nbest_examples" : "/鳩山積み-鳩山/鳩山積み-鳩/", + "nbest_cost" : "1000" } }, "analyzer" : {