expose individual kuromoji token filters & tokenizers in elasticsearch in addition to the japanese analyzer

2012-06-10 16:04:42 +02:00 · 2012-06-10 16:04:42 +02:00 · 801f621b52
parent 9356f463f2
commit 801f621b52
14 changed files with 745 additions and 15 deletions
--- a/README.md
+++ b/README.md
@ -6,7 +6,7 @@ The Japanese (kuromoji) Analysis plugin integrates Lucene kuromoji analysis modu
 In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-kuromoji/1.0.0`.
    --------------------------------------------------
-    | Smart Chinese Analysis Plugin | ElasticSearch  |
+    | Kuromoji Analysis Plugin      | ElasticSearch  |
    --------------------------------------------------
    | master                        | 0.19 -> master |
    --------------------------------------------------
--- a/src/main/java/org/elasticsearch/index/analysis/KuromojiAnalysisBinderProcessor.java
+++ b/src/main/java/org/elasticsearch/index/analysis/KuromojiAnalysisBinderProcessor.java
@ -0,0 +1,39 @@
 /*
 * Licensed to ElasticSearch and Shay Banon under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. ElasticSearch licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
 package org.elasticsearch.index.analysis;
 /**
  * Analysis binder processor that registers the individual kuromoji tokenizer
  * and token filter factory classes under their type names.
  */
 public class KuromojiAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor {

    // Type names under which the factories below are registered.
    private static final String TOKENIZER = "kuromoji_tokenizer";
    private static final String BASEFORM_FILTER = "kuromoji_baseform";
    private static final String PART_OF_SPEECH_FILTER = "kuromoji_part_of_speech";
    private static final String READINGFORM_FILTER = "kuromoji_readingform";
    private static final String STEMMER_FILTER = "kuromoji_stemmer";

    /**
     * Registers {@link KuromojiTokenizerFactory} as {@code kuromoji_tokenizer}.
     */
    @Override
    public void processTokenizers(TokenizersBindings tokenizersBindings) {
        tokenizersBindings.processTokenizer(TOKENIZER, KuromojiTokenizerFactory.class);
    }

    /**
     * Registers the four kuromoji token filter factories: base form,
     * part-of-speech, reading form and katakana stemmer.
     */
    @Override
    public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
        tokenFiltersBindings.processTokenFilter(BASEFORM_FILTER, KuromojiBaseFormFilterFactory.class);
        tokenFiltersBindings.processTokenFilter(PART_OF_SPEECH_FILTER, KuromojiPartOfSpeechFilterFactory.class);
        tokenFiltersBindings.processTokenFilter(READINGFORM_FILTER, KuromojiReadingFormFilterFactory.class);
        tokenFiltersBindings.processTokenFilter(STEMMER_FILTER, KuromojiKatakanaStemmerFactory.class);
    }
 }
--- a/src/main/java/org/elasticsearch/index/analysis/KuromojiAnalyzerProvider.java
+++ b/src/main/java/org/elasticsearch/index/analysis/KuromojiAnalyzerProvider.java
@ -22,6 +22,7 @@ package org.elasticsearch.index.analysis;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
 import org.apache.lucene.analysis.ja.JapaneseTokenizer;
 import org.apache.lucene.analysis.ja.dict.UserDictionary;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.inject.assistedinject.Assisted;
 import org.elasticsearch.common.settings.Settings;
@ -40,24 +41,16 @@ public class KuromojiAnalyzerProvider extends AbstractIndexAnalyzerProvider<Japa
    @Inject
    public KuromojiAnalyzerProvider(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name, settings);
-        Set<?> stopWords = Analysis.parseStopWords(env, settings, JapaneseAnalyzer.getDefaultStopSet(), version);
+        final Set<?> stopWords = Analysis.parseStopWords(env, settings, JapaneseAnalyzer.getDefaultStopSet(), version);
-        JapaneseTokenizer.Mode mode = JapaneseTokenizer.DEFAULT_MODE;
+        final JapaneseTokenizer.Mode mode = KuromojiTokenizerFactory.getMode(settings);
-        String modeSetting = settings.get("mode", null);
+        final UserDictionary userDictionary = KuromojiTokenizerFactory.getUserDictionary(env, settings);
-        if (modeSetting != null) {
+        analyzer = new JapaneseAnalyzer(version, userDictionary, mode, CharArraySet.copy(version, stopWords), JapaneseAnalyzer.getDefaultStopTags());
            if ("search".equalsIgnoreCase(modeSetting)) {
                mode = JapaneseTokenizer.Mode.SEARCH;
            } else if ("normal".equalsIgnoreCase(modeSetting)) {
                mode = JapaneseTokenizer.Mode.NORMAL;
            } else if ("extended".equalsIgnoreCase(modeSetting)) {
                mode = JapaneseTokenizer.Mode.EXTENDED;
            }
        }
        analyzer = new JapaneseAnalyzer(version, null, mode, CharArraySet.copy(version, stopWords), JapaneseAnalyzer.getDefaultStopTags());
    }
    /**
     * Returns the {@link JapaneseAnalyzer} held by this provider.
     */
    @Override
    public JapaneseAnalyzer get() {
        return analyzer;
    }
 }
--- a/src/main/java/org/elasticsearch/index/analysis/KuromojiBaseFormFilterFactory.java
+++ b/src/main/java/org/elasticsearch/index/analysis/KuromojiBaseFormFilterFactory.java
@ -0,0 +1,41 @@
 /*
 * Licensed to ElasticSearch and Shay Banon under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. ElasticSearch licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
 package org.elasticsearch.index.analysis;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.ja.JapaneseBaseFormFilter;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.inject.assistedinject.Assisted;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.settings.IndexSettings;
 /**
  * Token filter factory that wraps a token stream in a
  * {@link JapaneseBaseFormFilter}. It reads no settings of its own.
  */
 public class KuromojiBaseFormFilterFactory extends AbstractTokenFilterFactory {

    @Inject
    public KuromojiBaseFormFilterFactory(Index index,
            @IndexSettings Settings indexSettings,
            @Assisted String name,
            @Assisted Settings settings) {
        super(index, indexSettings, name, settings);
    }

    /**
     * Wraps the given stream in a new {@link JapaneseBaseFormFilter}.
     */
    @Override
    public TokenStream create(TokenStream tokenStream) {
        final TokenStream baseForms = new JapaneseBaseFormFilter(tokenStream);
        return baseForms;
    }
 }
--- a/src/main/java/org/elasticsearch/index/analysis/KuromojiKatakanaStemmerFactory.java
+++ b/src/main/java/org/elasticsearch/index/analysis/KuromojiKatakanaStemmerFactory.java
@ -0,0 +1,47 @@
 /*
 * Licensed to ElasticSearch and Shay Banon under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. ElasticSearch licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
 package org.elasticsearch.index.analysis;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilter;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.inject.assistedinject.Assisted;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.settings.IndexSettings;
 /**
  * Token filter factory for {@link JapaneseKatakanaStemFilter}.
  * <p>
  * Reads the {@code minimum_length} setting, falling back to
  * {@link JapaneseKatakanaStemFilter#DEFAULT_MINIMUM_LENGTH} when it is absent.
  */
 public class KuromojiKatakanaStemmerFactory extends AbstractTokenFilterFactory {

    /** Minimum length value handed to every filter this factory creates. */
    private final int minimumLength;

    @Inject
    public KuromojiKatakanaStemmerFactory(Index index, @IndexSettings Settings indexSettings,
            @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name, settings);
        final int fallback = JapaneseKatakanaStemFilter.DEFAULT_MINIMUM_LENGTH;
        this.minimumLength = settings.getAsInt("minimum_length", fallback).intValue();
    }

    /**
     * Wraps the given stream in a katakana stem filter using the configured
     * minimum length.
     */
    @Override
    public TokenStream create(TokenStream tokenStream) {
        final TokenStream stemmed = new JapaneseKatakanaStemFilter(tokenStream, minimumLength);
        return stemmed;
    }
 }
--- a/src/main/java/org/elasticsearch/index/analysis/KuromojiPartOfSpeechFilterFactory.java
+++ b/src/main/java/org/elasticsearch/index/analysis/KuromojiPartOfSpeechFilterFactory.java
@ -0,0 +1,60 @@
 /*
 * Licensed to ElasticSearch and Shay Banon under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. ElasticSearch licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
 package org.elasticsearch.index.analysis;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
 import org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilter;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.inject.assistedinject.Assisted;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.settings.IndexSettings;
 /**
  * Token filter factory for {@link JapanesePartOfSpeechStopFilter}.
  * <p>
  * Settings read by this factory:
  * <ul>
  * <li>{@code stoptags} - word list of part-of-speech tags to remove, resolved
  * through {@code Analysis.getWordList}. When no list is configured the
  * default stop tags of {@link JapaneseAnalyzer} are used, so an unconfigured
  * filter is not a no-op.</li>
  * <li>{@code enable_position_increments} - passed through to the filter;
  * defaults to {@code true}.</li>
  * </ul>
  */
 public class KuromojiPartOfSpeechFilterFactory extends
        AbstractTokenFilterFactory {
    private final boolean enablePositionIncrements;
    private final Set<String> stopTags = new HashSet<String>();

    @Inject
    public KuromojiPartOfSpeechFilterFactory(Index index,
            @IndexSettings Settings indexSettings, Environment env,
            @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name, settings);
        final List<String> configuredTags = Analysis.getWordList(env, settings, "stoptags");
        if (configuredTags != null) {
            stopTags.addAll(configuredTags);
        } else {
            // Nothing configured: fall back to the analyzer's default stop tags
            // rather than leaving the set empty, which would filter nothing.
            stopTags.addAll(JapaneseAnalyzer.getDefaultStopTags());
        }
        this.enablePositionIncrements = settings.getAsBoolean(
                "enable_position_increments", true);
    }

    /**
     * Wraps the given stream in a part-of-speech stop filter using the stop
     * tags and position-increment flag resolved in the constructor.
     */
    @Override
    public TokenStream create(TokenStream tokenStream) {
        return new JapanesePartOfSpeechStopFilter(enablePositionIncrements,
                tokenStream, stopTags);
    }
 }
--- a/src/main/java/org/elasticsearch/index/analysis/KuromojiReadingFormFilterFactory.java
+++ b/src/main/java/org/elasticsearch/index/analysis/KuromojiReadingFormFilterFactory.java
@ -0,0 +1,46 @@
 /*
 * Licensed to ElasticSearch and Shay Banon under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. ElasticSearch licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
 package org.elasticsearch.index.analysis;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.ja.JapaneseReadingFormFilter;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.inject.assistedinject.Assisted;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.settings.IndexSettings;
 /**
  * Token filter factory for {@link JapaneseReadingFormFilter}.
  * <p>
  * Reads the boolean {@code use_romaji} setting (default {@code false}) and
  * passes it to each filter it creates.
  */
 public class KuromojiReadingFormFilterFactory extends AbstractTokenFilterFactory {

    /** Value of the {@code use_romaji} setting. */
    private final boolean useRomaji;

    @Inject
    public KuromojiReadingFormFilterFactory(Index index, @IndexSettings Settings indexSettings,
            @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name, settings);
        this.useRomaji = settings.getAsBoolean("use_romaji", false);
    }

    /**
     * Wraps the given stream in a reading form filter using the configured
     * {@code use_romaji} flag.
     */
    @Override
    public TokenStream create(TokenStream tokenStream) {
        final TokenStream readings = new JapaneseReadingFormFilter(tokenStream, useRomaji);
        return readings;
    }
 }
--- a/src/main/java/org/elasticsearch/index/analysis/KuromojiTokenizerFactory.java
+++ b/src/main/java/org/elasticsearch/index/analysis/KuromojiTokenizerFactory.java
@ -0,0 +1,98 @@
 /*
 * Licensed to ElasticSearch and Shay Banon under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. ElasticSearch licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
 package org.elasticsearch.index.analysis;
 import java.io.IOException;
 import java.io.Reader;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.ja.JapaneseTokenizer;
 import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
 import org.apache.lucene.analysis.ja.dict.UserDictionary;
 import org.elasticsearch.ElasticSearchException;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.inject.assistedinject.Assisted;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.settings.IndexSettings;
 /**
  * Tokenizer factory that creates {@link JapaneseTokenizer} instances from
  * index settings.
  * <p>
  * Settings read by this factory:
  * <ul>
  * <li>{@code mode} - {@code search}, {@code normal} or {@code extended}
  * (case-insensitive); see {@link #getMode(Settings)}.</li>
  * <li>{@code user_dictionary} - source of an optional user dictionary; see
  * {@link #getUserDictionary(Environment, Settings)}.</li>
  * <li>{@code discard_punctuation} - passed through to the tokenizer;
  * defaults to {@code true}.</li>
  * </ul>
  */
 public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
    private static final String USER_DICT_OPTION = "user_dictionary";
    private final UserDictionary userDictionary;
    private final Mode mode;
    private final boolean discardPunctuation;

    @Inject
    public KuromojiTokenizerFactory(Index index,
            @IndexSettings Settings indexSettings, Environment env,
            @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name, settings);
        mode = getMode(settings);
        userDictionary = getUserDictionary(env, settings);
        discardPunctuation = settings.getAsBoolean("discard_punctuation", true);
    }

    /**
     * Loads the user dictionary referenced by the {@code user_dictionary}
     * setting.
     *
     * @param env      environment handed to {@code Analysis.getReaderFromFile}
     * @param settings settings that may contain {@code user_dictionary}
     * @return the parsed dictionary, or {@code null} when
     *         {@code Analysis.getReaderFromFile} yields no reader
     * @throws ElasticSearchException if reading or closing the dictionary
     *                                source fails with an {@link IOException}
     */
    public static UserDictionary getUserDictionary(Environment env,
            Settings settings) {
        try {
            final Reader reader = Analysis.getReaderFromFile(env, settings,
                    USER_DICT_OPTION);
            if (reader == null) {
                return null;
            } else {
                try {
                    return new UserDictionary(reader);
                } finally {
                    // Always release the reader, even if parsing fails.
                    reader.close();
                }
            }
        } catch (IOException e) {
            throw new ElasticSearchException(
                    "failed to load kuromoji user dictionary", e);
        }
    }

    /**
     * Resolves the tokenizer mode from the {@code mode} setting.
     *
     * @param settings settings that may contain {@code mode}
     * @return the matching mode; {@link JapaneseTokenizer#DEFAULT_MODE} when
     *         the setting is absent or holds an unrecognized value
     */
    public static JapaneseTokenizer.Mode getMode(Settings settings) {
        JapaneseTokenizer.Mode mode = JapaneseTokenizer.DEFAULT_MODE;
        String modeSetting = settings.get("mode", null);
        if (modeSetting != null) {
            if ("search".equalsIgnoreCase(modeSetting)) {
                mode = JapaneseTokenizer.Mode.SEARCH;
            } else if ("normal".equalsIgnoreCase(modeSetting)) {
                mode = JapaneseTokenizer.Mode.NORMAL;
            } else if ("extended".equalsIgnoreCase(modeSetting)) {
                mode = JapaneseTokenizer.Mode.EXTENDED;
            }
            // Any other value silently keeps the default mode.
        }
        return mode;
    }

    /**
     * Creates a tokenizer over the given reader using the user dictionary,
     * punctuation flag and mode resolved in the constructor.
     */
    @Override
    public Tokenizer create(Reader reader) {
        return new JapaneseTokenizer(reader, userDictionary,
                discardPunctuation, mode);
    }
 }
--- a/src/main/java/org/elasticsearch/indices/analysis/KuromojiIndicesAnalysis.java
+++ b/src/main/java/org/elasticsearch/indices/analysis/KuromojiIndicesAnalysis.java
@ -0,0 +1,121 @@
 /*
 * Licensed to ElasticSearch and Shay Banon under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. ElasticSearch licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
 package org.elasticsearch.indices.analysis;
 import java.io.Reader;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
 import org.apache.lucene.analysis.ja.JapaneseBaseFormFilter;
 import org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilter;
 import org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilter;
 import org.apache.lucene.analysis.ja.JapaneseReadingFormFilter;
 import org.apache.lucene.analysis.ja.JapaneseTokenizer;
 import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
 import org.elasticsearch.common.component.AbstractComponent;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.analysis.PreBuiltTokenFilterFactoryFactory;
 import org.elasticsearch.index.analysis.PreBuiltTokenizerFactoryFactory;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.index.analysis.TokenizerFactory;
 /**
  * Registers indices-level kuromoji analysis components so that, if not
  * explicitly configured, they are shared among all indices.
  * <p>
  * The pre-built components use fixed arguments: the tokenizer runs in
  * {@link Mode#SEARCH} with no user dictionary and punctuation discarded, the
  * part-of-speech filter uses the default stop tags of
  * {@link JapaneseAnalyzer} with position increments disabled, and the reading
  * form filter is created with its romaji flag set to {@code true}.
  */
 public class KuromojiIndicesAnalysis extends AbstractComponent {

    @Inject
    public KuromojiIndicesAnalysis(Settings settings,
            IndicesAnalysisService indicesAnalysisService) {
        super(settings);
        indicesAnalysisService.tokenizerFactories().put("kuromoji_tokenizer",
                new PreBuiltTokenizerFactoryFactory(tokenizer()));
        indicesAnalysisService.tokenFilterFactories().put("kuromoji_baseform",
                new PreBuiltTokenFilterFactoryFactory(baseFormFilter()));
        indicesAnalysisService.tokenFilterFactories().put("kuromoji_part_of_speech",
                new PreBuiltTokenFilterFactoryFactory(partOfSpeechFilter()));
        indicesAnalysisService.tokenFilterFactories().put("kuromoji_readingform",
                new PreBuiltTokenFilterFactoryFactory(readingFormFilter()));
        indicesAnalysisService.tokenFilterFactories().put("kuromoji_stemmer",
                new PreBuiltTokenFilterFactoryFactory(katakanaStemmer()));
    }

    /** Builds the shared {@code kuromoji_tokenizer} factory. */
    private static TokenizerFactory tokenizer() {
        return new TokenizerFactory() {
            @Override
            public String name() {
                return "kuromoji_tokenizer";
            }

            @Override
            public Tokenizer create(Reader reader) {
                return new JapaneseTokenizer(reader, null, true, Mode.SEARCH);
            }
        };
    }

    /** Builds the shared {@code kuromoji_baseform} filter factory. */
    private static TokenFilterFactory baseFormFilter() {
        return new TokenFilterFactory() {
            @Override
            public String name() {
                return "kuromoji_baseform";
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                return new JapaneseBaseFormFilter(tokenStream);
            }
        };
    }

    /** Builds the shared {@code kuromoji_part_of_speech} filter factory. */
    private static TokenFilterFactory partOfSpeechFilter() {
        return new TokenFilterFactory() {
            @Override
            public String name() {
                return "kuromoji_part_of_speech";
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                return new JapanesePartOfSpeechStopFilter(false, tokenStream,
                        JapaneseAnalyzer.getDefaultStopTags());
            }
        };
    }

    /** Builds the shared {@code kuromoji_readingform} filter factory. */
    private static TokenFilterFactory readingFormFilter() {
        return new TokenFilterFactory() {
            @Override
            public String name() {
                return "kuromoji_readingform";
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                return new JapaneseReadingFormFilter(tokenStream, true);
            }
        };
    }

    /** Builds the shared {@code kuromoji_stemmer} filter factory. */
    private static TokenFilterFactory katakanaStemmer() {
        return new TokenFilterFactory() {
            @Override
            public String name() {
                return "kuromoji_stemmer";
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                return new JapaneseKatakanaStemFilter(tokenStream);
            }
        };
    }
 }
--- a/src/main/java/org/elasticsearch/indices/analysis/KuromojiIndicesAnalysisModule.java
+++ b/src/main/java/org/elasticsearch/indices/analysis/KuromojiIndicesAnalysisModule.java
@ -0,0 +1,32 @@
 /*
 * Licensed to ElasticSearch and Shay Banon under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. ElasticSearch licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
 package org.elasticsearch.indices.analysis;
 import org.elasticsearch.common.inject.AbstractModule;
 /**
  * Injection module that binds {@link KuromojiIndicesAnalysis} as an eager
  * singleton.
  */
 public class KuromojiIndicesAnalysisModule extends AbstractModule {
    @Override
    protected void configure() {
        bind(KuromojiIndicesAnalysis.class).asEagerSingleton();
    }
 }
--- a/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java
+++ b/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java
@ -19,6 +19,7 @@
 package org.elasticsearch.plugin.analysis.kuromoji;
 import org.elasticsearch.index.analysis.KuromojiAnalysisBinderProcessor;
 import org.elasticsearch.index.analysis.KuromojiAnalyzerProvider;
 import org.elasticsearch.index.analysis.AnalysisModule;
 import org.elasticsearch.plugins.AbstractPlugin;
@ -40,5 +41,7 @@ public class AnalysisKuromojiPlugin extends AbstractPlugin {
    /**
     * Registers the kuromoji components on the given analysis module: the
     * {@code kuromoji} analyzer, provided by {@link KuromojiAnalyzerProvider},
     * and a {@link KuromojiAnalysisBinderProcessor}.
     *
     * @param module the analysis module to register the components on
     */
    public void onModule(AnalysisModule module) {
        module.addAnalyzer("kuromoji", KuromojiAnalyzerProvider.class);
        module.addProcessor(new KuromojiAnalysisBinderProcessor());
    }
 }
--- a/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java
+++ b/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java
@ -0,0 +1,218 @@
 /*
 * Licensed to ElasticSearch and Shay Banon under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. ElasticSearch licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
 package org.elasticsearch.index.analysis;
 import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder;
 import static org.elasticsearch.common.settings.ImmutableSettings.Builder.EMPTY_SETTINGS;
 import static org.hamcrest.Matchers.instanceOf;
 import java.io.IOException;
 import java.io.StringReader;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
 import org.apache.lucene.analysis.ja.JapaneseTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.elasticsearch.common.inject.Injector;
 import org.elasticsearch.common.inject.ModulesBuilder;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.settings.SettingsModule;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.env.EnvironmentModule;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.IndexNameModule;
 import org.elasticsearch.index.settings.IndexSettingsModule;
 import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
 import org.elasticsearch.indices.analysis.IndicesAnalysisService;
 import org.elasticsearch.plugin.analysis.kuromoji.AnalysisKuromojiPlugin;
 import org.elasticsearch.plugins.PluginsModule;
 import org.elasticsearch.plugins.PluginsService;
 import org.hamcrest.MatcherAssert;
 import org.testng.Assert;
 import org.testng.annotations.Test;
 /**
 */
 public class KuromojiAnalysisTests {
    @Test
    public void testDefaultsKuromojiAnalysis() {
        Index index = new Index("test");
        Injector parentInjector = new ModulesBuilder().add(
                new SettingsModule(EMPTY_SETTINGS),
                new EnvironmentModule(new Environment(EMPTY_SETTINGS)),
                new IndicesAnalysisModule()).createInjector();
        AnalysisModule analysisModule = new AnalysisModule(EMPTY_SETTINGS,
                parentInjector.getInstance(IndicesAnalysisService.class));
        new AnalysisKuromojiPlugin().onModule(analysisModule);
        Injector injector = new ModulesBuilder().add(
                new IndexSettingsModule(index, EMPTY_SETTINGS),
                new IndexNameModule(index), analysisModule)
                .createChildInjector(parentInjector);
        AnalysisService analysisService = injector
                .getInstance(AnalysisService.class);
        TokenizerFactory tokenizerFactory = analysisService
                .tokenizer("kuromoji_tokenizer");
        MatcherAssert.assertThat(tokenizerFactory,
                instanceOf(KuromojiTokenizerFactory.class));
        TokenFilterFactory filterFactory = analysisService
                .tokenFilter("kuromoji_part_of_speech");
        MatcherAssert.assertThat(filterFactory,
                instanceOf(KuromojiPartOfSpeechFilterFactory.class));
        filterFactory = analysisService.tokenFilter("kuromoji_readingform");
        MatcherAssert.assertThat(filterFactory,
                instanceOf(KuromojiReadingFormFilterFactory.class));
        filterFactory = analysisService.tokenFilter("kuromoji_baseform");
        MatcherAssert.assertThat(filterFactory,
                instanceOf(KuromojiBaseFormFilterFactory.class));
        filterFactory = analysisService.tokenFilter("kuromoji_stemmer");
        MatcherAssert.assertThat(filterFactory,
                instanceOf(KuromojiKatakanaStemmerFactory.class));
        NamedAnalyzer analyzer = analysisService.analyzer("kuromoji");
        MatcherAssert.assertThat(analyzer.analyzer(),
                instanceOf(JapaneseAnalyzer.class));
    }
    @Test
    public void testBaseFormFilterFactory() throws IOException {
        AnalysisService analysisService = createAnalysisService();
        TokenFilterFactory tokenFilter = analysisService
                .tokenFilter("kuromoji_pos");
        MatcherAssert.assertThat(tokenFilter,
                instanceOf(KuromojiPartOfSpeechFilterFactory.class));
        String source = "私は制限スピードを超える。";
        String[] expected = new String[] { "私", "は", "制限", "スピード", "を" };
        Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(source),
                null, true, JapaneseTokenizer.Mode.SEARCH);
        assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
    }
    @Test
    public void testReadingFormFilterFactory() throws IOException {
        AnalysisService analysisService = createAnalysisService();
        TokenFilterFactory tokenFilter = analysisService
                .tokenFilter("kuromoji_rf");
        MatcherAssert.assertThat(tokenFilter,
                instanceOf(KuromojiReadingFormFilterFactory.class));
        String source = "今夜はロバート先生と話した";
        String[] expected_tokens_romanji = new String[] { "kon'ya", "ha",
                "robato", "sensei", "to", "hanashi", "ta" };
        Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(source),
                null, true, JapaneseTokenizer.Mode.SEARCH);
        assertSimpleTSOutput(tokenFilter.create(tokenizer),
                expected_tokens_romanji);
        tokenizer = new JapaneseTokenizer(new StringReader(source), null, true,
                JapaneseTokenizer.Mode.SEARCH);
        String[] expected_tokens_katakana = new String[] { "コンヤ", "ハ", "ロバート",
                "センセイ", "ト", "ハナシ", "タ" };
        tokenFilter = analysisService.tokenFilter("kuromoji_readingform");
        MatcherAssert.assertThat(tokenFilter,
                instanceOf(KuromojiReadingFormFilterFactory.class));
        assertSimpleTSOutput(tokenFilter.create(tokenizer),
                expected_tokens_katakana);
    }
    @Test
    public void testKatakanaStemFilter() throws IOException {
        AnalysisService analysisService = createAnalysisService();
        TokenFilterFactory tokenFilter = analysisService
                .tokenFilter("kuromoji_stemmer");
        MatcherAssert.assertThat(tokenFilter,
                instanceOf(KuromojiKatakanaStemmerFactory.class));
        String source = "明後日パーティーに行く予定がある。図書館で資料をコピーしました。";
        ;
        Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(source),
                null, true, JapaneseTokenizer.Mode.SEARCH);
        // パーティー should be stemmed by default
        // (min len) コピー should not be stemmed
        String[] expected_tokens_katakana = new String[] { "明後日", "パーティ", "に",
                "行く", "予定", "が", "ある", "図書館", "で", "資料", "を", "コピー", "し", "まし",
                "た" };
        assertSimpleTSOutput(tokenFilter.create(tokenizer),
                expected_tokens_katakana);
        tokenFilter = analysisService.tokenFilter("kuromoji_ks");
        MatcherAssert.assertThat(tokenFilter,
                instanceOf(KuromojiKatakanaStemmerFactory.class));
        tokenizer = new JapaneseTokenizer(new StringReader(source), null, true,
                JapaneseTokenizer.Mode.SEARCH);
        // パーティー should not be stemmed since min len == 6
        // コピー should not be stemmed
        expected_tokens_katakana = new String[] { "明後日", "パーティー", "に", "行く",
                "予定", "が", "ある", "図書館", "で", "資料", "を", "コピー", "し", "まし", "た" };
        assertSimpleTSOutput(tokenFilter.create(tokenizer),
                expected_tokens_katakana);
    }
    public AnalysisService createAnalysisService() {
        Index index = new Index("test");
        Settings settings = settingsBuilder().loadFromClasspath(
                "org/elasticsearch/index/analysis/kuromoji_analysis.json")
                .build();
        Injector parentInjector = new ModulesBuilder().add(
                new SettingsModule(settings),
                new EnvironmentModule(new Environment(settings)),
                new IndicesAnalysisModule()).createInjector();
        AnalysisModule analysisModule = new AnalysisModule(settings,
                parentInjector.getInstance(IndicesAnalysisService.class));
        Injector injector = new ModulesBuilder().add(
                new IndexSettingsModule(index, settings),
                new PluginsModule(settings, parentInjector
                        .getInstance(PluginsService.class)),
                new IndexNameModule(index), analysisModule)
                .createChildInjector(parentInjector);
        AnalysisService analysisService = injector
                .getInstance(AnalysisService.class);
        return analysisService;
    }
    public static void assertSimpleTSOutput(TokenStream stream,
            String[] expected) throws IOException {
        stream.reset();
        CharTermAttribute termAttr = stream
                .getAttribute(CharTermAttribute.class);
        Assert.assertNotNull(termAttr);
        int i = 0;
        while (stream.incrementToken()) {
            Assert.assertTrue(i < expected.length);
            Assert.assertEquals(expected[i++], termAttr.toString(),
                    "expected different term at index " + i);
        }
        Assert.assertEquals(i, expected.length, "not all tokens produced");
    }
 }
--- a/src/test/java/org/elasticsearch/index/analysis/kuromoji_analysis.json
+++ b/src/test/java/org/elasticsearch/index/analysis/kuromoji_analysis.json
@ -0,0 +1,31 @@
 {
    "index":{
        "analysis":{
            "filter":{
                "kuromoji_rf":{
                    "type":"kuromoji_readingform",
                    "use_romaji" : "true"
                },
                "kuromoji_pos" : {
                    "type": "kuromoji_part_of_speech",
                    "enable_position_increments" : "false",
                    "stoptags" : ["#  verb-main:", "動詞-自立"]
                },
                "kuromoji_ks" : {
                    "type": "kuromoji_stemmer",
                    "minimum_length" : 6
                }
            },
            "tokenizer" : {
                "kuromoji" : {
                   "type":"kuromoji_tokenizer"
                }
            }
        }
    }
 }
--- a/src/test/resources/es-plugin.properties
+++ b/src/test/resources/es-plugin.properties
@ -0,0 +1 @@
 plugin=org.elasticsearch.plugin.analysis.kuromoji.AnalysisKuromojiPlugin
		`@ -0,0 +1 @@`
							`plugin=org.elasticsearch.plugin.analysis.kuromoji.AnalysisKuromojiPlugin`