Analysis Kuromoji: Add nbest option and NumberFilter

Add nbest_cost and nbest_examples parameters to KuromojiTokenizerFactory
Add KuromojiNumberFilterFactory
This commit is contained in:
Jun Ohtani 2016-03-16 17:43:21 +09:00
parent b07a8185a7
commit a9a0f262af
6 changed files with 184 additions and 2 deletions

View File

@ -122,6 +122,28 @@ dictionary to `$ES_HOME/config/userdict_ja.txt`:
東京スカイツリー,東京 スカイツリー,トウキョウ スカイツリー,カスタム名詞
-----------------------
`nbest_cost`/`nbest_examples`::
+
--
Additional expert user parameters `nbest_cost` and `nbest_examples` can be used
to include additional tokens that are most likely according to the statistical model.
If both parameters are used, the larger of the two resulting cost values is applied.
`nbest_cost`::
The `nbest_cost` parameter specifies an additional Viterbi cost.
The KuromojiTokenizer will include all tokens in Viterbi paths that are
within the nbest_cost value of the best path.
`nbest_examples`::
The `nbest_examples` can be used to find a `nbest_cost` value based on examples.
For example, a value of /箱根山-箱根/成田空港-成田/ indicates that, for the texts
箱根山 (Mt. Hakone) and 成田空港 (Narita Airport), we'd like a cost that gives us
箱根 (Hakone) and 成田 (Narita).
--
Then create an analyzer as follows:
[source,json]
@ -452,3 +474,48 @@ The above request returns:
}
--------------------------------------------------
[[analysis-kuromoji-number]]
===== `kuromoji_number` token filter
The `kuromoji_number` token filter normalizes Japanese numbers (kansūji)
to regular Arabic decimal numbers in half-width characters.
[source,json]
--------------------------------------------------
PUT kuromoji_sample
{
"settings": {
"index": {
"analysis": {
"analyzer": {
"my_analyzer": {
"tokenizer": "kuromoji_tokenizer",
"filter": [
"kuromoji_number"
]
}
}
}
}
}
}
POST kuromoji_sample/_analyze?analyzer=my_analyzer&text=一〇〇〇
--------------------------------------------------
// AUTOSENSE
[source,text]
--------------------------------------------------
# Result
{
"tokens" : [ {
"token" : "1000",
"start_offset" : 0,
"end_offset" : 4,
"type" : "word",
"position" : 1
} ]
}
--------------------------------------------------

View File

@ -0,0 +1,37 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.JapaneseNumberFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
/**
 * Factory for the {@code kuromoji_number} token filter.
 *
 * <p>Wraps the incoming stream in Lucene's {@link JapaneseNumberFilter}, which
 * normalizes Japanese numerals (kansūji) to half-width Arabic decimal numbers.
 */
public class KuromojiNumberFilterFactory extends AbstractTokenFilterFactory {

    /**
     * Creates the factory. No filter-specific settings are read; only the
     * base-class configuration is consumed.
     */
    public KuromojiNumberFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, name, settings);
    }

    /** Returns {@code tokenStream} wrapped in a {@link JapaneseNumberFilter}. */
    @Override
    public TokenStream create(TokenStream tokenStream) {
        return new JapaneseNumberFilter(tokenStream);
    }
}

View File

@ -36,9 +36,13 @@ import java.io.Reader;
public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
private static final String USER_DICT_OPTION = "user_dictionary";
private static final String NBEST_COST = "nbest_cost";
private static final String NBEST_EXAMPLES = "nbest_examples";
private final UserDictionary userDictionary;
private final Mode mode;
private final String nBestExamples;
private final int nBestCost;
private boolean discartPunctuation;
@ -47,6 +51,8 @@ public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
mode = getMode(settings);
userDictionary = getUserDictionary(env, settings);
discartPunctuation = settings.getAsBoolean("discard_punctuation", true);
nBestCost = settings.getAsInt(NBEST_COST, -1);
nBestExamples = settings.get(NBEST_EXAMPLES);
}
public static UserDictionary getUserDictionary(Environment env, Settings settings) {
@ -83,7 +89,13 @@ public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
/**
 * Builds a {@link JapaneseTokenizer} and applies the configured n-best
 * expansion.
 *
 * <p>NOTE(review): the rendered diff showed the removed pre-change line
 * ({@code return new JapaneseTokenizer(...)}) above the new body, leaving
 * unreachable code; only the post-commit implementation is kept here.
 */
@Override
public Tokenizer create() {
    JapaneseTokenizer t = new JapaneseTokenizer(userDictionary, discartPunctuation, mode);
    // Start from the explicitly configured cost (defaults to -1 when unset).
    int nBestCost = this.nBestCost;
    if (nBestExamples != null) {
        // Derive a cost from the examples and keep the larger of the two,
        // so nbest_cost and nbest_examples can be combined.
        nBestCost = Math.max(nBestCost, t.calcNBestCost(nBestExamples));
    }
    t.setNBestCost(nBestCost);
    return t;
}
}

View File

@ -24,6 +24,7 @@ import org.elasticsearch.index.analysis.KuromojiAnalyzerProvider;
import org.elasticsearch.index.analysis.KuromojiBaseFormFilterFactory;
import org.elasticsearch.index.analysis.KuromojiIterationMarkCharFilterFactory;
import org.elasticsearch.index.analysis.KuromojiKatakanaStemmerFactory;
import org.elasticsearch.index.analysis.KuromojiNumberFilterFactory;
import org.elasticsearch.index.analysis.KuromojiPartOfSpeechFilterFactory;
import org.elasticsearch.index.analysis.KuromojiReadingFormFilterFactory;
import org.elasticsearch.index.analysis.KuromojiTokenizerFactory;
@ -55,5 +56,6 @@ public class AnalysisKuromojiPlugin extends Plugin {
module.registerTokenFilter("kuromoji_readingform", KuromojiReadingFormFilterFactory::new);
module.registerTokenFilter("kuromoji_stemmer", KuromojiKatakanaStemmerFactory::new);
module.registerTokenFilter("ja_stop", JapaneseStopTokenFilterFactory::new);
module.registerTokenFilter("kuromoji_number", KuromojiNumberFilterFactory::new);
}
}

View File

@ -24,7 +24,11 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.elasticsearch.Version;
import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.inject.Injector;
import org.elasticsearch.common.inject.ModulesBuilder;
@ -75,6 +79,9 @@ public class KuromojiAnalysisTests extends ESTestCase {
filterFactory = analysisService.tokenFilter("ja_stop");
assertThat(filterFactory, instanceOf(JapaneseStopTokenFilterFactory.class));
filterFactory = analysisService.tokenFilter("kuromoji_number");
assertThat(filterFactory, instanceOf(KuromojiNumberFilterFactory.class));
NamedAnalyzer analyzer = analysisService.analyzer("kuromoji");
assertThat(analyzer.analyzer(), instanceOf(JapaneseAnalyzer.class));
@ -262,4 +269,49 @@ public class KuromojiAnalysisTests extends ESTestCase {
TokenizerFactory tokenizerFactory = analysisService.tokenizer("kuromoji_empty_user_dict");
assertThat(tokenizerFactory, instanceOf(KuromojiTokenizerFactory.class));
}
/**
 * A tokenizer configured with {@code nbest_cost} should emit the extra
 * Viterbi-path tokens in addition to the best-path segmentation.
 *
 * <p>NOTE(review): the first expected token was an empty string in the
 * captured text — an empty token is never emitted by the tokenizer, so the
 * dropped character is restored as 鳩 (matching the nbest_examples fixture
 * {@code /鳩山積み-鳩/}); confirm against upstream.
 */
public void testNbestCost() throws IOException {
    AnalysisService analysisService = createAnalysisService();
    TokenizerFactory tokenizerFactory = analysisService.tokenizer("kuromoji_nbest_cost");
    String source = "鳩山積み";
    String[] expected = new String[] {"鳩", "鳩山", "山積み", "積み"};

    Tokenizer tokenizer = tokenizerFactory.create();
    tokenizer.setReader(new StringReader(source));
    assertSimpleTSOutput(tokenizer, expected);
}
/**
 * A tokenizer configured with {@code nbest_examples} should derive a cost
 * from the examples and emit the same expanded token set as an explicit cost.
 *
 * <p>NOTE(review): first expected token restored from an empty string in the
 * captured text to 鳩 (see the {@code /鳩山積み-鳩/} example in the fixture);
 * confirm against upstream.
 */
public void testNbestExample() throws IOException {
    AnalysisService analysisService = createAnalysisService();
    TokenizerFactory tokenizerFactory = analysisService.tokenizer("kuromoji_nbest_examples");
    String source = "鳩山積み";
    String[] expected = new String[] {"鳩", "鳩山", "山積み", "積み"};

    Tokenizer tokenizer = tokenizerFactory.create();
    tokenizer.setReader(new StringReader(source));
    assertSimpleTSOutput(tokenizer, expected);
}
/**
 * When both {@code nbest_cost} and {@code nbest_examples} are set, the larger
 * of the two resulting costs applies, so the output matches the single-option
 * cases.
 *
 * <p>NOTE(review): first expected token restored from an empty string in the
 * captured text to 鳩; confirm against upstream.
 */
public void testNbestBothOptions() throws IOException {
    AnalysisService analysisService = createAnalysisService();
    TokenizerFactory tokenizerFactory = analysisService.tokenizer("kuromoji_nbest_both");
    String source = "鳩山積み";
    String[] expected = new String[] {"鳩", "鳩山", "山積み", "積み"};

    Tokenizer tokenizer = tokenizerFactory.create();
    tokenizer.setReader(new StringReader(source));
    assertSimpleTSOutput(tokenizer, expected);
}
/**
 * {@code kuromoji_number} should normalize the kansūji run 十万二千五百 to
 * "102500" while leaving the surrounding tokens untouched.
 *
 * <p>NOTE(review): three expected tokens were empty strings in the captured
 * text — empty tokens are never emitted — so the dropped characters are
 * restored as the counter/particles 円, を, た implied by the source sentence
 * 本日|十万二千五百|円|の|ワイン|を|買っ|た; confirm against upstream.
 */
public void testNumberFilterFactory() throws Exception {
    AnalysisService analysisService = createAnalysisService();
    TokenFilterFactory tokenFilter = analysisService.tokenFilter("kuromoji_number");
    assertThat(tokenFilter, instanceOf(KuromojiNumberFilterFactory.class));
    String source = "本日十万二千五百円のワインを買った";
    String[] expected = new String[]{"本日", "102500", "円", "の", "ワイン", "を", "買っ", "た"};
    Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(new StringReader(source));
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
}

View File

@ -18,7 +18,6 @@
"type": "ja_stop",
"stopwords": ["_japanese_", "スピード"]
}
},
"char_filter":{
@ -48,6 +47,19 @@
"kuromoji_user_dict" : {
"type":"kuromoji_tokenizer",
"user_dictionary":"user_dict.txt"
},
"kuromoji_nbest_cost" : {
"type": "kuromoji_tokenizer",
"nbest_cost" : "2000"
},
"kuromoji_nbest_examples" : {
"type": "kuromoji_tokenizer",
"nbest_examples" : "/鳩山積み-鳩山/鳩山積み-鳩/"
},
"kuromoji_nbest_both" : {
"type": "kuromoji_tokenizer",
"nbest_examples" : "/鳩山積み-鳩山/鳩山積み-鳩/",
"nbest_cost" : "1000"
}
},
"analyzer" : {