From 801f621b5274f9ebdd2bf93f4abca5862d0eae48 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Sun, 10 Jun 2012 16:04:42 +0200 Subject: [PATCH] expose individual kuromoji token filters & tokenizers in elasticsearch in addition to the japanese analyzer --- README.md | 2 +- .../KuromojiAnalysisBinderProcessor.java | 39 ++++ .../analysis/KuromojiAnalyzerProvider.java | 21 +- .../KuromojiBaseFormFilterFactory.java | 41 ++++ .../KuromojiKatakanaStemmerFactory.java | 47 ++++ .../KuromojiPartOfSpeechFilterFactory.java | 60 +++++ .../KuromojiReadingFormFilterFactory.java | 46 ++++ .../analysis/KuromojiTokenizerFactory.java | 98 ++++++++ .../analysis/KuromojiIndicesAnalysis.java | 121 ++++++++++ .../KuromojiIndicesAnalysisModule.java | 32 +++ .../kuromoji/AnalysisKuromojiPlugin.java | 3 + .../index/analysis/KuromojiAnalysisTests.java | 218 ++++++++++++++++++ .../index/analysis/kuromoji_analysis.json | 31 +++ src/test/resources/es-plugin.properties | 1 + 14 files changed, 745 insertions(+), 15 deletions(-) create mode 100644 src/main/java/org/elasticsearch/index/analysis/KuromojiAnalysisBinderProcessor.java create mode 100644 src/main/java/org/elasticsearch/index/analysis/KuromojiBaseFormFilterFactory.java create mode 100644 src/main/java/org/elasticsearch/index/analysis/KuromojiKatakanaStemmerFactory.java create mode 100644 src/main/java/org/elasticsearch/index/analysis/KuromojiPartOfSpeechFilterFactory.java create mode 100644 src/main/java/org/elasticsearch/index/analysis/KuromojiReadingFormFilterFactory.java create mode 100644 src/main/java/org/elasticsearch/index/analysis/KuromojiTokenizerFactory.java create mode 100644 src/main/java/org/elasticsearch/indices/analysis/KuromojiIndicesAnalysis.java create mode 100644 src/main/java/org/elasticsearch/indices/analysis/KuromojiIndicesAnalysisModule.java create mode 100644 src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java create mode 100644 
src/test/java/org/elasticsearch/index/analysis/kuromoji_analysis.json create mode 100644 src/test/resources/es-plugin.properties diff --git a/README.md b/README.md index 8f19fe8be82..fe9e72e3d76 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ The Japanese (kuromoji) Analysis plugin integrates Lucene kuromoji analysis modu In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-kuromoji/1.0.0`. -------------------------------------------------- - | Smart Chinese Analysis Plugin | ElasticSearch | + | Kuromoji Analysis Plugin | ElasticSearch | -------------------------------------------------- | master | 0.19 -> master | -------------------------------------------------- diff --git a/src/main/java/org/elasticsearch/index/analysis/KuromojiAnalysisBinderProcessor.java b/src/main/java/org/elasticsearch/index/analysis/KuromojiAnalysisBinderProcessor.java new file mode 100644 index 00000000000..a761b92c950 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/KuromojiAnalysisBinderProcessor.java @@ -0,0 +1,39 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.analysis; + + +/** Registers the kuromoji analysis components by name: the "kuromoji_tokenizer" tokenizer factory and the "kuromoji_baseform", "kuromoji_part_of_speech", "kuromoji_readingform" and "kuromoji_stemmer" token filter factories. + */ +public class KuromojiAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor { + + @Override + public void processTokenizers(TokenizersBindings tokenizersBindings) { + tokenizersBindings.processTokenizer("kuromoji_tokenizer", KuromojiTokenizerFactory.class); + } + + @Override + public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) { + tokenFiltersBindings.processTokenFilter("kuromoji_baseform", KuromojiBaseFormFilterFactory.class); + tokenFiltersBindings.processTokenFilter("kuromoji_part_of_speech", KuromojiPartOfSpeechFilterFactory.class); + tokenFiltersBindings.processTokenFilter("kuromoji_readingform", KuromojiReadingFormFilterFactory.class); + tokenFiltersBindings.processTokenFilter("kuromoji_stemmer", KuromojiKatakanaStemmerFactory.class); + } +} \ No newline at end of file diff --git a/src/main/java/org/elasticsearch/index/analysis/KuromojiAnalyzerProvider.java b/src/main/java/org/elasticsearch/index/analysis/KuromojiAnalyzerProvider.java index 857e3f89dd8..1e59f46709f 100644 --- a/src/main/java/org/elasticsearch/index/analysis/KuromojiAnalyzerProvider.java +++ b/src/main/java/org/elasticsearch/index/analysis/KuromojiAnalyzerProvider.java @@ -22,6 +22,7 @@ package org.elasticsearch.index.analysis; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.ja.JapaneseAnalyzer; import org.apache.lucene.analysis.ja.JapaneseTokenizer; +import org.apache.lucene.analysis.ja.dict.UserDictionary; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.inject.assistedinject.Assisted; import org.elasticsearch.common.settings.Settings; @@ -40,24 +41,16 @@ public class KuromojiAnalyzerProvider extends AbstractIndexAnalyzerProvider stopWords = Analysis.parseStopWords(env, settings, JapaneseAnalyzer.getDefaultStopSet(), version); - JapaneseTokenizer.Mode mode = JapaneseTokenizer.DEFAULT_MODE; - String modeSetting = 
settings.get("mode", null); - if (modeSetting != null) { - if ("search".equalsIgnoreCase(modeSetting)) { - mode = JapaneseTokenizer.Mode.SEARCH; - } else if ("normal".equalsIgnoreCase(modeSetting)) { - mode = JapaneseTokenizer.Mode.NORMAL; - } else if ("extended".equalsIgnoreCase(modeSetting)) { - mode = JapaneseTokenizer.Mode.EXTENDED; - } - } - - analyzer = new JapaneseAnalyzer(version, null, mode, CharArraySet.copy(version, stopWords), JapaneseAnalyzer.getDefaultStopTags()); + final Set stopWords = Analysis.parseStopWords(env, settings, JapaneseAnalyzer.getDefaultStopSet(), version); + final JapaneseTokenizer.Mode mode = KuromojiTokenizerFactory.getMode(settings); + final UserDictionary userDictionary = KuromojiTokenizerFactory.getUserDictionary(env, settings); + analyzer = new JapaneseAnalyzer(version, userDictionary, mode, CharArraySet.copy(version, stopWords), JapaneseAnalyzer.getDefaultStopTags()); } @Override public JapaneseAnalyzer get() { return this.analyzer; } + + } diff --git a/src/main/java/org/elasticsearch/index/analysis/KuromojiBaseFormFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/KuromojiBaseFormFilterFactory.java new file mode 100644 index 00000000000..d9a1bcfd879 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/KuromojiBaseFormFilterFactory.java @@ -0,0 +1,41 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.ja.JapaneseBaseFormFilter; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; +/** Token filter factory that wraps the incoming stream in a JapaneseBaseFormFilter; it reads no filter-specific settings. */ +public class KuromojiBaseFormFilterFactory extends AbstractTokenFilterFactory { + + @Inject + public KuromojiBaseFormFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name, settings); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new JapaneseBaseFormFilter(tokenStream); + } +} diff --git a/src/main/java/org/elasticsearch/index/analysis/KuromojiKatakanaStemmerFactory.java b/src/main/java/org/elasticsearch/index/analysis/KuromojiKatakanaStemmerFactory.java new file mode 100644 index 00000000000..32064031232 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/KuromojiKatakanaStemmerFactory.java @@ -0,0 +1,47 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilter; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; +/** Token filter factory for JapaneseKatakanaStemFilter; the "minimum_length" setting defaults to JapaneseKatakanaStemFilter.DEFAULT_MINIMUM_LENGTH. */ +public class KuromojiKatakanaStemmerFactory extends AbstractTokenFilterFactory { + + private final int minimumLength; + + @Inject + public KuromojiKatakanaStemmerFactory(Index index, + @IndexSettings Settings indexSettings, @Assisted String name, + @Assisted Settings settings) { + super(index, indexSettings, name, settings); + minimumLength = settings.getAsInt("minimum_length", + JapaneseKatakanaStemFilter.DEFAULT_MINIMUM_LENGTH).intValue(); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new JapaneseKatakanaStemFilter(tokenStream, minimumLength); + } +} diff --git a/src/main/java/org/elasticsearch/index/analysis/KuromojiPartOfSpeechFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/KuromojiPartOfSpeechFilterFactory.java new file mode 100644 index 00000000000..9ccf3f96f22 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/KuromojiPartOfSpeechFilterFactory.java @@ -0,0 +1,60 @@ +/* + * Licensed 
to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilter; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; +/** Token filter factory for JapanesePartOfSpeechStopFilter; stop tags come from the "stoptags" word list (the set stays empty when getWordList returns null) and "enable_position_increments" defaults to true. */ +public class KuromojiPartOfSpeechFilterFactory extends + AbstractTokenFilterFactory { + + private final boolean enablePositionIncrements; + private final Set stopTags = new HashSet(); + + @Inject + public KuromojiPartOfSpeechFilterFactory(Index index, + @IndexSettings Settings indexSettings, Environment env, + @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name, settings); + List wordList = Analysis.getWordList(env, settings, "stoptags"); + if (wordList != null) { + stopTags.addAll(wordList); + } + this.enablePositionIncrements = settings.getAsBoolean( + "enable_position_increments", true); + } + + 
@Override + public TokenStream create(TokenStream tokenStream) { + return new JapanesePartOfSpeechStopFilter(enablePositionIncrements, + tokenStream, stopTags); + } + +} diff --git a/src/main/java/org/elasticsearch/index/analysis/KuromojiReadingFormFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/KuromojiReadingFormFilterFactory.java new file mode 100644 index 00000000000..ffe0e75b917 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/KuromojiReadingFormFilterFactory.java @@ -0,0 +1,46 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.ja.JapaneseReadingFormFilter; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; +/** Token filter factory for JapaneseReadingFormFilter; the "use_romaji" setting defaults to false and is passed through to the filter. */ +public class KuromojiReadingFormFilterFactory extends + AbstractTokenFilterFactory { + private final boolean useRomaji; + + @Inject + public KuromojiReadingFormFilterFactory(Index index, + @IndexSettings Settings indexSettings, @Assisted String name, + @Assisted Settings settings) { + super(index, indexSettings, name, settings); + useRomaji = settings.getAsBoolean("use_romaji", false); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new JapaneseReadingFormFilter(tokenStream, useRomaji); + } +} diff --git a/src/main/java/org/elasticsearch/index/analysis/KuromojiTokenizerFactory.java b/src/main/java/org/elasticsearch/index/analysis/KuromojiTokenizerFactory.java new file mode 100644 index 00000000000..10e9946942a --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/KuromojiTokenizerFactory.java @@ -0,0 +1,98 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.index.analysis; + +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.ja.JapaneseTokenizer; +import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode; +import org.apache.lucene.analysis.ja.dict.UserDictionary; +import org.elasticsearch.ElasticSearchException; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + +/** Tokenizer factory for JapaneseTokenizer, configured by the "mode", "user_dictionary" and "discard_punctuation" (default true) settings. + */ +public class KuromojiTokenizerFactory extends AbstractTokenizerFactory { + + private static final String USER_DICT_OPTION = "user_dictionary"; + + private final UserDictionary userDictionary; + private final Mode mode; + + private final boolean discardPunctuation; + + @Inject + public KuromojiTokenizerFactory(Index index, + @IndexSettings Settings indexSettings, Environment env, + @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name, settings); + mode = getMode(settings); + userDictionary = getUserDictionary(env, settings); + discardPunctuation = settings.getAsBoolean("discard_punctuation", true); + } + /** Builds a UserDictionary from the reader that Analysis.getReaderFromFile returns for the "user_dictionary" setting and closes that reader; returns null when no reader is returned, and wraps an IOException in ElasticSearchException. */ + public static UserDictionary getUserDictionary(Environment env, + Settings settings) { + try { + final Reader reader = Analysis.getReaderFromFile(env, settings, + USER_DICT_OPTION); + if (reader == null) { + return null; + } else { + 
try { + return new UserDictionary(reader); + } finally { + reader.close(); + } + } + } catch (IOException e) { + throw new ElasticSearchException( + "failed to load kuromoji user dictionary", e); + } + } + /** Parses the "mode" setting ("search", "normal" or "extended", case-insensitive); a missing or unrecognized value yields JapaneseTokenizer.DEFAULT_MODE. */ + public static JapaneseTokenizer.Mode getMode(Settings settings) { + JapaneseTokenizer.Mode mode = JapaneseTokenizer.DEFAULT_MODE; + String modeSetting = settings.get("mode", null); + if (modeSetting != null) { + if ("search".equalsIgnoreCase(modeSetting)) { + mode = JapaneseTokenizer.Mode.SEARCH; + } else if ("normal".equalsIgnoreCase(modeSetting)) { + mode = JapaneseTokenizer.Mode.NORMAL; + } else if ("extended".equalsIgnoreCase(modeSetting)) { + mode = JapaneseTokenizer.Mode.EXTENDED; + } + } + return mode; + } + /** Creates a JapaneseTokenizer over the reader using the user dictionary, punctuation flag and mode resolved in the constructor. */ + @Override + public Tokenizer create(Reader reader) { + return new JapaneseTokenizer(reader, userDictionary, + discardPunctuation, mode); + } + +} \ No newline at end of file diff --git a/src/main/java/org/elasticsearch/indices/analysis/KuromojiIndicesAnalysis.java b/src/main/java/org/elasticsearch/indices/analysis/KuromojiIndicesAnalysis.java new file mode 100644 index 00000000000..06d6e16b309 --- /dev/null +++ b/src/main/java/org/elasticsearch/indices/analysis/KuromojiIndicesAnalysis.java @@ -0,0 +1,121 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.indices.analysis; + +import java.io.Reader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.ja.JapaneseAnalyzer; +import org.apache.lucene.analysis.ja.JapaneseBaseFormFilter; +import org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilter; +import org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilter; +import org.apache.lucene.analysis.ja.JapaneseReadingFormFilter; +import org.apache.lucene.analysis.ja.JapaneseTokenizer; +import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode; +import org.elasticsearch.common.component.AbstractComponent; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.analysis.PreBuiltTokenFilterFactoryFactory; +import org.elasticsearch.index.analysis.PreBuiltTokenizerFactoryFactory; +import org.elasticsearch.index.analysis.TokenFilterFactory; +import org.elasticsearch.index.analysis.TokenizerFactory; + +/** + * Registers indices level analysis components so, if not explicitly configured, + * will be shared among all indices. 
+ */ +public class KuromojiIndicesAnalysis extends AbstractComponent { + /** Puts pre-built factories for "kuromoji_tokenizer" and the four kuromoji token filters into the given IndicesAnalysisService, each with fixed constructor arguments. NOTE(review): these fixed arguments differ from the defaults of the per-index factories (use_romaji false, enable_position_increments true, no stop tags) - confirm this is intended. */ + @Inject + public KuromojiIndicesAnalysis(Settings settings, + IndicesAnalysisService indicesAnalysisService) { + super(settings); + /* Tokenizer: no user dictionary, third argument (discard punctuation) true, SEARCH mode. */ + indicesAnalysisService.tokenizerFactories().put("kuromoji_tokenizer", + new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { + @Override + public String name() { + return "kuromoji_tokenizer"; + } + + @Override + public Tokenizer create(Reader reader) { + return new JapaneseTokenizer(reader, null, true, + Mode.SEARCH); + } + })); + + indicesAnalysisService.tokenFilterFactories().put("kuromoji_baseform", + new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override + public String name() { + return "kuromoji_baseform"; + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new JapaneseBaseFormFilter(tokenStream); + } + })); + /* Part-of-speech stop filter: position increments flag false, stop tags from JapaneseAnalyzer.getDefaultStopTags(). */ + indicesAnalysisService.tokenFilterFactories().put( + "kuromoji_part_of_speech", + new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override + public String name() { + return "kuromoji_part_of_speech"; + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new JapanesePartOfSpeechStopFilter(false, + tokenStream, JapaneseAnalyzer + .getDefaultStopTags()); + } + })); + /* Reading form filter: second argument true is the romaji flag (the per-index factory passes its use_romaji setting here). */ + indicesAnalysisService.tokenFilterFactories().put( + "kuromoji_readingform", + new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override + public String name() { + return "kuromoji_readingform"; + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new JapaneseReadingFormFilter(tokenStream, true); + } + })); + + indicesAnalysisService.tokenFilterFactories().put("kuromoji_stemmer", + new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override + public String name() { + return "kuromoji_stemmer"; + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new 
JapaneseKatakanaStemFilter(tokenStream); + } + })); + } +} \ No newline at end of file diff --git a/src/main/java/org/elasticsearch/indices/analysis/KuromojiIndicesAnalysisModule.java b/src/main/java/org/elasticsearch/indices/analysis/KuromojiIndicesAnalysisModule.java new file mode 100644 index 00000000000..1c44342057a --- /dev/null +++ b/src/main/java/org/elasticsearch/indices/analysis/KuromojiIndicesAnalysisModule.java @@ -0,0 +1,32 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.indices.analysis; + +import org.elasticsearch.common.inject.AbstractModule; + +/** Injection module that binds KuromojiIndicesAnalysis as an eager singleton. + */ +public class KuromojiIndicesAnalysisModule extends AbstractModule { + + @Override + protected void configure() { + bind(KuromojiIndicesAnalysis.class).asEagerSingleton(); + } +} \ No newline at end of file diff --git a/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java b/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java index a3ba70ad5cf..7a0fcb5d890 100644 --- a/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java +++ b/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java @@ -19,6 +19,7 @@ package org.elasticsearch.plugin.analysis.kuromoji; +import org.elasticsearch.index.analysis.KuromojiAnalysisBinderProcessor; import org.elasticsearch.index.analysis.KuromojiAnalyzerProvider; import org.elasticsearch.index.analysis.AnalysisModule; import org.elasticsearch.plugins.AbstractPlugin; @@ -40,5 +41,7 @@ public class AnalysisKuromojiPlugin extends AbstractPlugin { public void onModule(AnalysisModule module) { module.addAnalyzer("kuromoji", KuromojiAnalyzerProvider.class); + module.addProcessor(new KuromojiAnalysisBinderProcessor()); + } } diff --git a/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java b/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java new file mode 100644 index 00000000000..bd82192b7d6 --- /dev/null +++ b/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java @@ -0,0 +1,218 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder; +import static org.elasticsearch.common.settings.ImmutableSettings.Builder.EMPTY_SETTINGS; +import static org.hamcrest.Matchers.instanceOf; + +import java.io.IOException; +import java.io.StringReader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.ja.JapaneseAnalyzer; +import org.apache.lucene.analysis.ja.JapaneseTokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.elasticsearch.common.inject.Injector; +import org.elasticsearch.common.inject.ModulesBuilder; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.settings.SettingsModule; +import org.elasticsearch.env.Environment; +import org.elasticsearch.env.EnvironmentModule; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.IndexNameModule; +import org.elasticsearch.index.settings.IndexSettingsModule; +import org.elasticsearch.indices.analysis.IndicesAnalysisModule; +import org.elasticsearch.indices.analysis.IndicesAnalysisService; +import org.elasticsearch.plugin.analysis.kuromoji.AnalysisKuromojiPlugin; +import org.elasticsearch.plugins.PluginsModule; +import org.elasticsearch.plugins.PluginsService; +import 
org.hamcrest.MatcherAssert; +import org.testng.Assert; +import org.testng.annotations.Test; + +/** Checks that the kuromoji plugin registers its tokenizer, token filters and analyzer, and that the filters configured in kuromoji_analysis.json emit the expected terms. + */ +public class KuromojiAnalysisTests { + + @Test + public void testDefaultsKuromojiAnalysis() { + Index index = new Index("test"); + + Injector parentInjector = new ModulesBuilder().add( + new SettingsModule(EMPTY_SETTINGS), + new EnvironmentModule(new Environment(EMPTY_SETTINGS)), + new IndicesAnalysisModule()).createInjector(); + AnalysisModule analysisModule = new AnalysisModule(EMPTY_SETTINGS, + parentInjector.getInstance(IndicesAnalysisService.class)); + new AnalysisKuromojiPlugin().onModule(analysisModule); + Injector injector = new ModulesBuilder().add( + new IndexSettingsModule(index, EMPTY_SETTINGS), + new IndexNameModule(index), analysisModule) + .createChildInjector(parentInjector); + + AnalysisService analysisService = injector + .getInstance(AnalysisService.class); + + TokenizerFactory tokenizerFactory = analysisService + .tokenizer("kuromoji_tokenizer"); + MatcherAssert.assertThat(tokenizerFactory, + instanceOf(KuromojiTokenizerFactory.class)); + + TokenFilterFactory filterFactory = analysisService + .tokenFilter("kuromoji_part_of_speech"); + MatcherAssert.assertThat(filterFactory, + instanceOf(KuromojiPartOfSpeechFilterFactory.class)); + + filterFactory = analysisService.tokenFilter("kuromoji_readingform"); + MatcherAssert.assertThat(filterFactory, + instanceOf(KuromojiReadingFormFilterFactory.class)); + + filterFactory = analysisService.tokenFilter("kuromoji_baseform"); + MatcherAssert.assertThat(filterFactory, + instanceOf(KuromojiBaseFormFilterFactory.class)); + + filterFactory = analysisService.tokenFilter("kuromoji_stemmer"); + MatcherAssert.assertThat(filterFactory, + instanceOf(KuromojiKatakanaStemmerFactory.class)); + + NamedAnalyzer analyzer = analysisService.analyzer("kuromoji"); + MatcherAssert.assertThat(analyzer.analyzer(), + instanceOf(JapaneseAnalyzer.class)); + } + /** NOTE: despite its name, this test exercises the "kuromoji_pos" part-of-speech stop filter, not the base form filter. */ + @Test + public void testBaseFormFilterFactory() throws IOException 
{ + AnalysisService analysisService = createAnalysisService(); + TokenFilterFactory tokenFilter = analysisService + .tokenFilter("kuromoji_pos"); + MatcherAssert.assertThat(tokenFilter, + instanceOf(KuromojiPartOfSpeechFilterFactory.class)); + String source = "私は制限スピードを超える。"; + String[] expected = new String[] { "私", "は", "制限", "スピード", "を" }; + Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(source), + null, true, JapaneseTokenizer.Mode.SEARCH); + assertSimpleTSOutput(tokenFilter.create(tokenizer), expected); + + } + + @Test + public void testReadingFormFilterFactory() throws IOException { + AnalysisService analysisService = createAnalysisService(); + TokenFilterFactory tokenFilter = analysisService + .tokenFilter("kuromoji_rf"); + MatcherAssert.assertThat(tokenFilter, + instanceOf(KuromojiReadingFormFilterFactory.class)); + String source = "今夜はロバート先生と話した"; + String[] expected_tokens_romanji = new String[] { "kon'ya", "ha", + "robato", "sensei", "to", "hanashi", "ta" }; + + Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(source), + null, true, JapaneseTokenizer.Mode.SEARCH); + + assertSimpleTSOutput(tokenFilter.create(tokenizer), + expected_tokens_romanji); + + tokenizer = new JapaneseTokenizer(new StringReader(source), null, true, + JapaneseTokenizer.Mode.SEARCH); + String[] expected_tokens_katakana = new String[] { "コンヤ", "ハ", "ロバート", + "センセイ", "ト", "ハナシ", "タ" }; + tokenFilter = analysisService.tokenFilter("kuromoji_readingform"); + MatcherAssert.assertThat(tokenFilter, + instanceOf(KuromojiReadingFormFilterFactory.class)); + assertSimpleTSOutput(tokenFilter.create(tokenizer), + expected_tokens_katakana); + } + + @Test + public void testKatakanaStemFilter() throws IOException { + AnalysisService analysisService = createAnalysisService(); + TokenFilterFactory tokenFilter = analysisService + .tokenFilter("kuromoji_stemmer"); + MatcherAssert.assertThat(tokenFilter, + instanceOf(KuromojiKatakanaStemmerFactory.class)); + String source = 
"明後日パーティーに行く予定がある。図書館で資料をコピーしました。"; + ; + Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(source), + null, true, JapaneseTokenizer.Mode.SEARCH); + + // パーティー should be stemmed by default + // (min len) コピー should not be stemmed + String[] expected_tokens_katakana = new String[] { "明後日", "パーティ", "に", + "行く", "予定", "が", "ある", "図書館", "で", "資料", "を", "コピー", "し", "まし", + "た" }; + assertSimpleTSOutput(tokenFilter.create(tokenizer), + expected_tokens_katakana); + + tokenFilter = analysisService.tokenFilter("kuromoji_ks"); + MatcherAssert.assertThat(tokenFilter, + instanceOf(KuromojiKatakanaStemmerFactory.class)); + tokenizer = new JapaneseTokenizer(new StringReader(source), null, true, + JapaneseTokenizer.Mode.SEARCH); + + // パーティー should not be stemmed since min len == 6 + // コピー should not be stemmed + expected_tokens_katakana = new String[] { "明後日", "パーティー", "に", "行く", + "予定", "が", "ある", "図書館", "で", "資料", "を", "コピー", "し", "まし", "た" }; + assertSimpleTSOutput(tokenFilter.create(tokenizer), + expected_tokens_katakana); + + } + /** Builds an AnalysisService for index "test" from the settings in kuromoji_analysis.json loaded from the classpath. */ + public AnalysisService createAnalysisService() { + Index index = new Index("test"); + Settings settings = settingsBuilder().loadFromClasspath( + "org/elasticsearch/index/analysis/kuromoji_analysis.json") + .build(); + Injector parentInjector = new ModulesBuilder().add( + new SettingsModule(settings), + new EnvironmentModule(new Environment(settings)), + new IndicesAnalysisModule()).createInjector(); + AnalysisModule analysisModule = new AnalysisModule(settings, + parentInjector.getInstance(IndicesAnalysisService.class)); + Injector injector = new ModulesBuilder().add( + new IndexSettingsModule(index, settings), + new PluginsModule(settings, parentInjector + .getInstance(PluginsService.class)), + new IndexNameModule(index), analysisModule) + .createChildInjector(parentInjector); + + AnalysisService analysisService = injector + .getInstance(AnalysisService.class); + return analysisService; + } + /** Asserts that the stream emits exactly the expected terms, in order; the failure message reports the zero-based index of the mismatching term. */ + public static void 
assertSimpleTSOutput(TokenStream stream, + String[] expected) throws IOException { + stream.reset(); + CharTermAttribute termAttr = stream + .getAttribute(CharTermAttribute.class); + Assert.assertNotNull(termAttr); + int i = 0; + for (; stream.incrementToken(); i++) { + Assert.assertTrue(i < expected.length); + Assert.assertEquals(termAttr.toString(), expected[i], + "expected different term at index " + i); + } + Assert.assertEquals(i, expected.length, "not all tokens produced"); + } + +} \ No newline at end of file diff --git a/src/test/java/org/elasticsearch/index/analysis/kuromoji_analysis.json b/src/test/java/org/elasticsearch/index/analysis/kuromoji_analysis.json new file mode 100644 index 00000000000..de8dd28770f --- /dev/null +++ b/src/test/java/org/elasticsearch/index/analysis/kuromoji_analysis.json @@ -0,0 +1,31 @@ +{ + "index":{ + "analysis":{ + "filter":{ + "kuromoji_rf":{ + "type":"kuromoji_readingform", + "use_romaji" : "true" + }, + "kuromoji_pos" : { + "type": "kuromoji_part_of_speech", + "enable_position_increments" : "false", + "stoptags" : ["# verb-main:", "動詞-自立"] + }, + "kuromoji_ks" : { + "type": "kuromoji_stemmer", + "minimum_length" : 6 + } + + + }, + + "tokenizer" : { + "kuromoji" : { + "type":"kuromoji_tokenizer" + } + + } + + } + } +} \ No newline at end of file diff --git a/src/test/resources/es-plugin.properties b/src/test/resources/es-plugin.properties new file mode 100644 index 00000000000..c35abda2c62 --- /dev/null +++ b/src/test/resources/es-plugin.properties @@ -0,0 +1 @@ +plugin=org.elasticsearch.plugin.analysis.kuromoji.AnalysisKuromojiPlugin