Analysis Kuromoji: Add nbest option and NumberFilter
Add nbest_cost and nbest_examples parameter to KuromojiTokenizerFactory Add KuromojiNumberFilterFactory
This commit is contained in:
parent
b07a8185a7
commit
a9a0f262af
|
@ -122,6 +122,28 @@ dictionary to `$ES_HOME/config/userdict_ja.txt`:
|
|||
東京スカイツリー,東京 スカイツリー,トウキョウ スカイツリー,カスタム名詞
|
||||
-----------------------
|
||||
|
||||
`nbest_cost`/`nbest_examples`::
|
||||
+
|
||||
--
|
||||
Additional expert user parameters `nbest_cost` and `nbest_examples` can be used
|
||||
to include additional tokens that are most likely according to the statistical model.
|
||||
If both parameters are used, the larger of the two resulting costs is applied.
|
||||
|
||||
`nbest_cost`::
|
||||
|
||||
The `nbest_cost` parameter specifies an additional Viterbi cost.
|
||||
The KuromojiTokenizer will include all tokens in Viterbi paths that are
|
||||
within the nbest_cost value of the best path.
|
||||
|
||||
`nbest_examples`::
|
||||
|
||||
The `nbest_examples` parameter can be used to find an `nbest_cost` value based on examples.
|
||||
For example, a value of /箱根山-箱根/成田空港-成田/ indicates that in the texts,
|
||||
箱根山 (Mt. Hakone) and 成田空港 (Narita Airport) we'd like a cost that gives us
|
||||
箱根 (Hakone) and 成田 (Narita).
|
||||
--
|
||||
|
||||
|
||||
Then create an analyzer as follows:
|
||||
|
||||
[source,json]
|
||||
|
@ -452,3 +474,48 @@ The above request returns:
|
|||
}
|
||||
--------------------------------------------------
|
||||
|
||||
[[analysis-kuromoji-number]]
|
||||
===== `kuromoji_number` token filter
|
||||
|
||||
The `kuromoji_number` token filter normalizes Japanese numbers (kansūji)
|
||||
to regular Arabic decimal numbers in half-width characters.
|
||||
|
||||
[source,json]
|
||||
--------------------------------------------------
|
||||
PUT kuromoji_sample
|
||||
{
|
||||
"settings": {
|
||||
"index": {
|
||||
"analysis": {
|
||||
"analyzer": {
|
||||
"my_analyzer": {
|
||||
"tokenizer": "kuromoji_tokenizer",
|
||||
"filter": [
|
||||
"kuromoji_number"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
POST kuromoji_sample/_analyze?analyzer=my_analyzer&text=一〇〇〇
|
||||
|
||||
--------------------------------------------------
|
||||
// AUTOSENSE
|
||||
|
||||
[source,text]
|
||||
--------------------------------------------------
|
||||
# Result
|
||||
{
|
||||
"tokens" : [ {
|
||||
"token" : "1000",
|
||||
"start_offset" : 0,
|
||||
"end_offset" : 4,
|
||||
"type" : "word",
|
||||
"position" : 1
|
||||
} ]
|
||||
}
|
||||
--------------------------------------------------
|
||||
|
||||
|
|
|
@ -0,0 +1,37 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
package org.elasticsearch.index.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.ja.JapaneseNumberFilter;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
|
||||
public class KuromojiNumberFilterFactory extends AbstractTokenFilterFactory {
|
||||
|
||||
public KuromojiNumberFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream) {
|
||||
return new JapaneseNumberFilter(tokenStream);
|
||||
}
|
||||
}
|
|
@ -36,9 +36,13 @@ import java.io.Reader;
|
|||
public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
|
||||
|
||||
private static final String USER_DICT_OPTION = "user_dictionary";
|
||||
private static final String NBEST_COST = "nbest_cost";
|
||||
private static final String NBEST_EXAMPLES = "nbest_examples";
|
||||
|
||||
private final UserDictionary userDictionary;
|
||||
private final Mode mode;
|
||||
private final String nBestExamples;
|
||||
private final int nBestCost;
|
||||
|
||||
private boolean discartPunctuation;
|
||||
|
||||
|
@ -47,6 +51,8 @@ public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
|
|||
mode = getMode(settings);
|
||||
userDictionary = getUserDictionary(env, settings);
|
||||
discartPunctuation = settings.getAsBoolean("discard_punctuation", true);
|
||||
nBestCost = settings.getAsInt(NBEST_COST, -1);
|
||||
nBestExamples = settings.get(NBEST_EXAMPLES);
|
||||
}
|
||||
|
||||
public static UserDictionary getUserDictionary(Environment env, Settings settings) {
|
||||
|
@ -83,7 +89,13 @@ public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
|
|||
|
||||
@Override
|
||||
public Tokenizer create() {
|
||||
return new JapaneseTokenizer(userDictionary, discartPunctuation, mode);
|
||||
JapaneseTokenizer t = new JapaneseTokenizer(userDictionary, discartPunctuation, mode);
|
||||
int nBestCost = this.nBestCost;
|
||||
if (nBestExamples != null) {
|
||||
nBestCost = Math.max(nBestCost, t.calcNBestCost(nBestExamples));
|
||||
}
|
||||
t.setNBestCost(nBestCost);
|
||||
return t;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -24,6 +24,7 @@ import org.elasticsearch.index.analysis.KuromojiAnalyzerProvider;
|
|||
import org.elasticsearch.index.analysis.KuromojiBaseFormFilterFactory;
|
||||
import org.elasticsearch.index.analysis.KuromojiIterationMarkCharFilterFactory;
|
||||
import org.elasticsearch.index.analysis.KuromojiKatakanaStemmerFactory;
|
||||
import org.elasticsearch.index.analysis.KuromojiNumberFilterFactory;
|
||||
import org.elasticsearch.index.analysis.KuromojiPartOfSpeechFilterFactory;
|
||||
import org.elasticsearch.index.analysis.KuromojiReadingFormFilterFactory;
|
||||
import org.elasticsearch.index.analysis.KuromojiTokenizerFactory;
|
||||
|
@ -55,5 +56,6 @@ public class AnalysisKuromojiPlugin extends Plugin {
|
|||
module.registerTokenFilter("kuromoji_readingform", KuromojiReadingFormFilterFactory::new);
|
||||
module.registerTokenFilter("kuromoji_stemmer", KuromojiKatakanaStemmerFactory::new);
|
||||
module.registerTokenFilter("ja_stop", JapaneseStopTokenFilterFactory::new);
|
||||
module.registerTokenFilter("kuromoji_number", KuromojiNumberFilterFactory::new);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -24,7 +24,11 @@ import org.apache.lucene.analysis.Tokenizer;
|
|||
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
|
||||
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
|
||||
import org.elasticsearch.cluster.metadata.IndexMetaData;
|
||||
import org.elasticsearch.common.inject.Injector;
|
||||
import org.elasticsearch.common.inject.ModulesBuilder;
|
||||
|
@ -75,6 +79,9 @@ public class KuromojiAnalysisTests extends ESTestCase {
|
|||
filterFactory = analysisService.tokenFilter("ja_stop");
|
||||
assertThat(filterFactory, instanceOf(JapaneseStopTokenFilterFactory.class));
|
||||
|
||||
filterFactory = analysisService.tokenFilter("kuromoji_number");
|
||||
assertThat(filterFactory, instanceOf(KuromojiNumberFilterFactory.class));
|
||||
|
||||
NamedAnalyzer analyzer = analysisService.analyzer("kuromoji");
|
||||
assertThat(analyzer.analyzer(), instanceOf(JapaneseAnalyzer.class));
|
||||
|
||||
|
@ -262,4 +269,49 @@ public class KuromojiAnalysisTests extends ESTestCase {
|
|||
TokenizerFactory tokenizerFactory = analysisService.tokenizer("kuromoji_empty_user_dict");
|
||||
assertThat(tokenizerFactory, instanceOf(KuromojiTokenizerFactory.class));
|
||||
}
|
||||
|
||||
public void testNbestCost() throws IOException {
|
||||
AnalysisService analysisService = createAnalysisService();
|
||||
TokenizerFactory tokenizerFactory = analysisService.tokenizer("kuromoji_nbest_cost");
|
||||
String source = "鳩山積み";
|
||||
String[] expected = new String[] {"鳩", "鳩山", "山積み", "積み"};
|
||||
|
||||
Tokenizer tokenizer = tokenizerFactory.create();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
assertSimpleTSOutput(tokenizer, expected);
|
||||
}
|
||||
|
||||
public void testNbestExample() throws IOException {
|
||||
AnalysisService analysisService = createAnalysisService();
|
||||
TokenizerFactory tokenizerFactory = analysisService.tokenizer("kuromoji_nbest_examples");
|
||||
String source = "鳩山積み";
|
||||
String[] expected = new String[] {"鳩", "鳩山", "山積み", "積み"};
|
||||
|
||||
Tokenizer tokenizer = tokenizerFactory.create();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
assertSimpleTSOutput(tokenizer, expected);
|
||||
}
|
||||
|
||||
public void testNbestBothOptions() throws IOException {
|
||||
AnalysisService analysisService = createAnalysisService();
|
||||
TokenizerFactory tokenizerFactory = analysisService.tokenizer("kuromoji_nbest_both");
|
||||
String source = "鳩山積み";
|
||||
String[] expected = new String[] {"鳩", "鳩山", "山積み", "積み"};
|
||||
|
||||
Tokenizer tokenizer = tokenizerFactory.create();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
assertSimpleTSOutput(tokenizer, expected);
|
||||
|
||||
}
|
||||
|
||||
public void testNumberFilterFactory() throws Exception {
|
||||
AnalysisService analysisService = createAnalysisService();
|
||||
TokenFilterFactory tokenFilter = analysisService.tokenFilter("kuromoji_number");
|
||||
assertThat(tokenFilter, instanceOf(KuromojiNumberFilterFactory.class));
|
||||
String source = "本日十万二千五百円のワインを買った";
|
||||
String[] expected = new String[]{"本日", "102500", "円", "の", "ワイン", "を", "買っ", "た"};
|
||||
Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,7 +18,6 @@
|
|||
"type": "ja_stop",
|
||||
"stopwords": ["_japanese_", "スピード"]
|
||||
}
|
||||
|
||||
},
|
||||
|
||||
"char_filter":{
|
||||
|
@ -48,6 +47,19 @@
|
|||
"kuromoji_user_dict" : {
|
||||
"type":"kuromoji_tokenizer",
|
||||
"user_dictionary":"user_dict.txt"
|
||||
},
|
||||
"kuromoji_nbest_cost" : {
|
||||
"type": "kuromoji_tokenizer",
|
||||
"nbest_cost" : "2000"
|
||||
},
|
||||
"kuromoji_nbest_examples" : {
|
||||
"type": "kuromoji_tokenizer",
|
||||
"nbest_examples" : "/鳩山積み-鳩山/鳩山積み-鳩/"
|
||||
},
|
||||
"kuromoji_nbest_both" : {
|
||||
"type": "kuromoji_tokenizer",
|
||||
"nbest_examples" : "/鳩山積み-鳩山/鳩山積み-鳩/",
|
||||
"nbest_cost" : "1000"
|
||||
}
|
||||
},
|
||||
"analyzer" : {
|
||||
|
|
Loading…
Reference in New Issue