expose individual kuromoji token fitlers & tokenizers in elasticsearch in addition to the japanese analyzer

2012-06-10 16:04:42 +02:00 · 2012-06-10 16:04:42 +02:00 · 801f621b52
parent 9356f463f2
commit 801f621b52
14 changed files with 745 additions and 15 deletions
--- a/README.md
+++ b/README.md
@ -6,7 +6,7 @@ The Japanese (kuromoji) Analysis plugin integrates Lucene kuromoji analysis modu
 In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-kuromoji/1.0.0`.

    --------------------------------------------------
-    | Smart Chinese Analysis Plugin | ElasticSearch  |
+    | Kuromoji Analysis Plugin      | ElasticSearch  |
    --------------------------------------------------
    | master                        | 0.19 -> master |
    --------------------------------------------------
--- a/src/main/java/org/elasticsearch/index/analysis/KuromojiAnalysisBinderProcessor.java
+++ b/src/main/java/org/elasticsearch/index/analysis/KuromojiAnalysisBinderProcessor.java
@ -0,0 +1,39 @@
+/*
+ * Licensed to ElasticSearch and Shay Banon under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. ElasticSearch licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+
+/**
+ */
+public class KuromojiAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor {
+
+    @Override
+    public void processTokenizers(TokenizersBindings tokenizersBindings) {
+      tokenizersBindings.processTokenizer("kuromoji_tokenizer", KuromojiTokenizerFactory.class);
+    }
+
+    @Override
+    public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
+        tokenFiltersBindings.processTokenFilter("kuromoji_baseform", KuromojiBaseFormFilterFactory.class);
+        tokenFiltersBindings.processTokenFilter("kuromoji_part_of_speech", KuromojiPartOfSpeechFilterFactory.class);
+        tokenFiltersBindings.processTokenFilter("kuromoji_readingform", KuromojiReadingFormFilterFactory.class);
+        tokenFiltersBindings.processTokenFilter("kuromoji_stemmer", KuromojiKatakanaStemmerFactory.class);
+    }
+}
--- a/src/main/java/org/elasticsearch/index/analysis/KuromojiAnalyzerProvider.java
+++ b/src/main/java/org/elasticsearch/index/analysis/KuromojiAnalyzerProvider.java
@ -22,6 +22,7 @@ package org.elasticsearch.index.analysis;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
 import org.apache.lucene.analysis.ja.JapaneseTokenizer;
+import org.apache.lucene.analysis.ja.dict.UserDictionary;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.inject.assistedinject.Assisted;
 import org.elasticsearch.common.settings.Settings;
@ -40,24 +41,16 @@ public class KuromojiAnalyzerProvider extends AbstractIndexAnalyzerProvider<Japa
    @Inject
    public KuromojiAnalyzerProvider(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name, settings);
-        Set<?> stopWords = Analysis.parseStopWords(env, settings, JapaneseAnalyzer.getDefaultStopSet(), version);
-        JapaneseTokenizer.Mode mode = JapaneseTokenizer.DEFAULT_MODE;
-        String modeSetting = settings.get("mode", null);
-        if (modeSetting != null) {
-            if ("search".equalsIgnoreCase(modeSetting)) {
-                mode = JapaneseTokenizer.Mode.SEARCH;
-            } else if ("normal".equalsIgnoreCase(modeSetting)) {
-                mode = JapaneseTokenizer.Mode.NORMAL;
-            } else if ("extended".equalsIgnoreCase(modeSetting)) {
-                mode = JapaneseTokenizer.Mode.EXTENDED;
-            }
-        }
-
-        analyzer = new JapaneseAnalyzer(version, null, mode, CharArraySet.copy(version, stopWords), JapaneseAnalyzer.getDefaultStopTags());
+        final Set<?> stopWords = Analysis.parseStopWords(env, settings, JapaneseAnalyzer.getDefaultStopSet(), version);
+        final JapaneseTokenizer.Mode mode = KuromojiTokenizerFactory.getMode(settings);
+        final UserDictionary userDictionary = KuromojiTokenizerFactory.getUserDictionary(env, settings);
+        analyzer = new JapaneseAnalyzer(version, userDictionary, mode, CharArraySet.copy(version, stopWords), JapaneseAnalyzer.getDefaultStopTags());
    }

    @Override
    public JapaneseAnalyzer get() {
        return this.analyzer;
    }
+    
+    
 }
--- a/src/main/java/org/elasticsearch/index/analysis/KuromojiBaseFormFilterFactory.java
+++ b/src/main/java/org/elasticsearch/index/analysis/KuromojiBaseFormFilterFactory.java
@ -0,0 +1,41 @@
+/*
+ * Licensed to ElasticSearch and Shay Banon under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. ElasticSearch licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.ja.JapaneseBaseFormFilter;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.inject.assistedinject.Assisted;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.settings.IndexSettings;
+
+public class KuromojiBaseFormFilterFactory extends AbstractTokenFilterFactory {
+
+  @Inject
+  public KuromojiBaseFormFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
+      super(index, indexSettings, name, settings);
+  }
+
+  @Override
+  public TokenStream create(TokenStream tokenStream) {
+      return new JapaneseBaseFormFilter(tokenStream);
+  }
+}
--- a/src/main/java/org/elasticsearch/index/analysis/KuromojiKatakanaStemmerFactory.java
+++ b/src/main/java/org/elasticsearch/index/analysis/KuromojiKatakanaStemmerFactory.java
@ -0,0 +1,47 @@
+/*
+ * Licensed to ElasticSearch and Shay Banon under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. ElasticSearch licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilter;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.inject.assistedinject.Assisted;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.settings.IndexSettings;
+
+public class KuromojiKatakanaStemmerFactory extends AbstractTokenFilterFactory {
+
+    private final int minimumLength;
+
+    @Inject
+    public KuromojiKatakanaStemmerFactory(Index index,
+            @IndexSettings Settings indexSettings, @Assisted String name,
+            @Assisted Settings settings) {
+        super(index, indexSettings, name, settings);
+        minimumLength = settings.getAsInt("minimum_length",
+                JapaneseKatakanaStemFilter.DEFAULT_MINIMUM_LENGTH).intValue();
+    }
+
+    @Override
+    public TokenStream create(TokenStream tokenStream) {
+        return new JapaneseKatakanaStemFilter(tokenStream, minimumLength);
+    }
+}
--- a/src/main/java/org/elasticsearch/index/analysis/KuromojiPartOfSpeechFilterFactory.java
+++ b/src/main/java/org/elasticsearch/index/analysis/KuromojiPartOfSpeechFilterFactory.java
@ -0,0 +1,60 @@
+/*
+ * Licensed to ElasticSearch and Shay Banon under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. ElasticSearch licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilter;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.inject.assistedinject.Assisted;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.settings.IndexSettings;
+
+public class KuromojiPartOfSpeechFilterFactory extends
+        AbstractTokenFilterFactory {
+
+    private final boolean enablePositionIncrements;
+    private final Set<String> stopTags = new HashSet<String>();
+
+    @Inject
+    public KuromojiPartOfSpeechFilterFactory(Index index,
+            @IndexSettings Settings indexSettings, Environment env,
+            @Assisted String name, @Assisted Settings settings) {
+        super(index, indexSettings, name, settings);
+        List<String> wordList = Analysis.getWordList(env, settings, "stoptags");
+        if (wordList != null) {
+            stopTags.addAll(wordList);
+        }
+        this.enablePositionIncrements = settings.getAsBoolean(
+                "enable_position_increments", true);
+    }
+
+    @Override
+    public TokenStream create(TokenStream tokenStream) {
+        return new JapanesePartOfSpeechStopFilter(enablePositionIncrements,
+                tokenStream, stopTags);
+    }
+
+}
--- a/src/main/java/org/elasticsearch/index/analysis/KuromojiReadingFormFilterFactory.java
+++ b/src/main/java/org/elasticsearch/index/analysis/KuromojiReadingFormFilterFactory.java
@ -0,0 +1,46 @@
+/*
+ * Licensed to ElasticSearch and Shay Banon under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. ElasticSearch licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.ja.JapaneseReadingFormFilter;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.inject.assistedinject.Assisted;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.settings.IndexSettings;
+
+public class KuromojiReadingFormFilterFactory extends
+        AbstractTokenFilterFactory {
+    private final boolean useRomaji;
+
+    @Inject
+    public KuromojiReadingFormFilterFactory(Index index,
+            @IndexSettings Settings indexSettings, @Assisted String name,
+            @Assisted Settings settings) {
+        super(index, indexSettings, name, settings);
+        useRomaji = settings.getAsBoolean("use_romaji", false);
+    }
+
+    @Override
+    public TokenStream create(TokenStream tokenStream) {
+        return new JapaneseReadingFormFilter(tokenStream, useRomaji);
+    }
+}
--- a/src/main/java/org/elasticsearch/index/analysis/KuromojiTokenizerFactory.java
+++ b/src/main/java/org/elasticsearch/index/analysis/KuromojiTokenizerFactory.java
@ -0,0 +1,98 @@
+/*
+ * Licensed to ElasticSearch and Shay Banon under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. ElasticSearch licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.index.analysis;
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ja.JapaneseTokenizer;
+import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
+import org.apache.lucene.analysis.ja.dict.UserDictionary;
+import org.elasticsearch.ElasticSearchException;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.inject.assistedinject.Assisted;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.settings.IndexSettings;
+
+/**
+ */
+public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
+
+    private static final String USER_DICT_OPTION = "user_dictionary";
+
+    private final UserDictionary userDictionary;
+    private final Mode mode;
+
+    private boolean discartPunctuation;
+
+    @Inject
+    public KuromojiTokenizerFactory(Index index,
+            @IndexSettings Settings indexSettings, Environment env,
+            @Assisted String name, @Assisted Settings settings) {
+        super(index, indexSettings, name, settings);
+        mode = getMode(settings);
+        userDictionary = getUserDictionary(env, settings);
+        discartPunctuation = settings.getAsBoolean("discard_punctuation", true);
+    }
+
+    public static UserDictionary getUserDictionary(Environment env,
+            Settings settings) {
+        try {
+            final Reader reader = Analysis.getReaderFromFile(env, settings,
+                    USER_DICT_OPTION);
+            if (reader == null) {
+                return null;
+            } else {
+                try {
+                    return new UserDictionary(reader);
+                } finally {
+                    reader.close();
+                }
+            }
+        } catch (IOException e) {
+            throw new ElasticSearchException(
+                    "failed to load kuromoji user dictionary", e);
+        }
+    }
+
+    public static JapaneseTokenizer.Mode getMode(Settings settings) {
+        JapaneseTokenizer.Mode mode = JapaneseTokenizer.DEFAULT_MODE;
+        String modeSetting = settings.get("mode", null);
+        if (modeSetting != null) {
+            if ("search".equalsIgnoreCase(modeSetting)) {
+                mode = JapaneseTokenizer.Mode.SEARCH;
+            } else if ("normal".equalsIgnoreCase(modeSetting)) {
+                mode = JapaneseTokenizer.Mode.NORMAL;
+            } else if ("extended".equalsIgnoreCase(modeSetting)) {
+                mode = JapaneseTokenizer.Mode.EXTENDED;
+            }
+        }
+        return mode;
+    }
+
+    @Override
+    public Tokenizer create(Reader reader) {
+        return new JapaneseTokenizer(reader, userDictionary,
+                discartPunctuation, mode);
+    }
+
+}
--- a/src/main/java/org/elasticsearch/indices/analysis/KuromojiIndicesAnalysis.java
+++ b/src/main/java/org/elasticsearch/indices/analysis/KuromojiIndicesAnalysis.java
@ -0,0 +1,121 @@
+/*
+ * Licensed to ElasticSearch and Shay Banon under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. ElasticSearch licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.indices.analysis;
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
+import org.apache.lucene.analysis.ja.JapaneseBaseFormFilter;
+import org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilter;
+import org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilter;
+import org.apache.lucene.analysis.ja.JapaneseReadingFormFilter;
+import org.apache.lucene.analysis.ja.JapaneseTokenizer;
+import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
+import org.elasticsearch.common.component.AbstractComponent;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.analysis.PreBuiltTokenFilterFactoryFactory;
+import org.elasticsearch.index.analysis.PreBuiltTokenizerFactoryFactory;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
+import org.elasticsearch.index.analysis.TokenizerFactory;
+
+/**
+ * Registers indices level analysis components so, if not explicitly configured,
+ * will be shared among all indices.
+ */
+public class KuromojiIndicesAnalysis extends AbstractComponent {
+
+    @Inject
+    public KuromojiIndicesAnalysis(Settings settings,
+            IndicesAnalysisService indicesAnalysisService) {
+        super(settings);
+
+        indicesAnalysisService.tokenizerFactories().put("kuromoji_tokenizer",
+                new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
+                    @Override
+                    public String name() {
+                        return "kuromoji_tokenizer";
+                    }
+
+                    @Override
+                    public Tokenizer create(Reader reader) {
+                        return new JapaneseTokenizer(reader, null, true,
+                                Mode.SEARCH);
+                    }
+                }));
+
+        indicesAnalysisService.tokenFilterFactories().put("kuromoji_baseform",
+                new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+                    @Override
+                    public String name() {
+                        return "kuromoji_baseform";
+                    }
+
+                    @Override
+                    public TokenStream create(TokenStream tokenStream) {
+                        return new JapaneseBaseFormFilter(tokenStream);
+                    }
+                }));
+
+        indicesAnalysisService.tokenFilterFactories().put(
+                "kuromoji_part_of_speech",
+                new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+                    @Override
+                    public String name() {
+                        return "kuromoji_part_of_speech";
+                    }
+
+                    @Override
+                    public TokenStream create(TokenStream tokenStream) {
+                        return new JapanesePartOfSpeechStopFilter(false,
+                                tokenStream, JapaneseAnalyzer
+                                        .getDefaultStopTags());
+                    }
+                }));
+
+        indicesAnalysisService.tokenFilterFactories().put(
+                "kuromoji_readingform",
+                new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+                    @Override
+                    public String name() {
+                        return "kuromoji_readingform";
+                    }
+
+                    @Override
+                    public TokenStream create(TokenStream tokenStream) {
+                        return new JapaneseReadingFormFilter(tokenStream, true);
+                    }
+                }));
+
+        indicesAnalysisService.tokenFilterFactories().put("kuromoji_stemmer",
+                new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+                    @Override
+                    public String name() {
+                        return "kuromoji_stemmer";
+                    }
+
+                    @Override
+                    public TokenStream create(TokenStream tokenStream) {
+                        return new JapaneseKatakanaStemFilter(tokenStream);
+                    }
+                }));
+    }
+}
--- a/src/main/java/org/elasticsearch/indices/analysis/KuromojiIndicesAnalysisModule.java
+++ b/src/main/java/org/elasticsearch/indices/analysis/KuromojiIndicesAnalysisModule.java
@ -0,0 +1,32 @@
+/*
+ * Licensed to ElasticSearch and Shay Banon under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. ElasticSearch licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.indices.analysis;
+
+import org.elasticsearch.common.inject.AbstractModule;
+
+/**
+ */
+public class KuromojiIndicesAnalysisModule extends AbstractModule {
+
+    @Override
+    protected void configure() {
+        bind(KuromojiIndicesAnalysis.class).asEagerSingleton();
+    }
+}
--- a/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java
+++ b/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java
@ -19,6 +19,7 @@

 package org.elasticsearch.plugin.analysis.kuromoji;

+import org.elasticsearch.index.analysis.KuromojiAnalysisBinderProcessor;
 import org.elasticsearch.index.analysis.KuromojiAnalyzerProvider;
 import org.elasticsearch.index.analysis.AnalysisModule;
 import org.elasticsearch.plugins.AbstractPlugin;
@ -40,5 +41,7 @@ public class AnalysisKuromojiPlugin extends AbstractPlugin {

    public void onModule(AnalysisModule module) {
        module.addAnalyzer("kuromoji", KuromojiAnalyzerProvider.class);
+        module.addProcessor(new KuromojiAnalysisBinderProcessor());
+
    }
 }
--- a/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java
+++ b/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java
@ -0,0 +1,218 @@
+/*
+ * Licensed to ElasticSearch and Shay Banon under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. ElasticSearch licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder;
+import static org.elasticsearch.common.settings.ImmutableSettings.Builder.EMPTY_SETTINGS;
+import static org.hamcrest.Matchers.instanceOf;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
+import org.apache.lucene.analysis.ja.JapaneseTokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.elasticsearch.common.inject.Injector;
+import org.elasticsearch.common.inject.ModulesBuilder;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.settings.SettingsModule;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.env.EnvironmentModule;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.IndexNameModule;
+import org.elasticsearch.index.settings.IndexSettingsModule;
+import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
+import org.elasticsearch.indices.analysis.IndicesAnalysisService;
+import org.elasticsearch.plugin.analysis.kuromoji.AnalysisKuromojiPlugin;
+import org.elasticsearch.plugins.PluginsModule;
+import org.elasticsearch.plugins.PluginsService;
+import org.hamcrest.MatcherAssert;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+/**
+ */
+public class KuromojiAnalysisTests {
+
+    @Test
+    public void testDefaultsKuromojiAnalysis() {
+        Index index = new Index("test");
+
+        Injector parentInjector = new ModulesBuilder().add(
+                new SettingsModule(EMPTY_SETTINGS),
+                new EnvironmentModule(new Environment(EMPTY_SETTINGS)),
+                new IndicesAnalysisModule()).createInjector();
+        AnalysisModule analysisModule = new AnalysisModule(EMPTY_SETTINGS,
+                parentInjector.getInstance(IndicesAnalysisService.class));
+        new AnalysisKuromojiPlugin().onModule(analysisModule);
+        Injector injector = new ModulesBuilder().add(
+                new IndexSettingsModule(index, EMPTY_SETTINGS),
+                new IndexNameModule(index), analysisModule)
+                .createChildInjector(parentInjector);
+
+        AnalysisService analysisService = injector
+                .getInstance(AnalysisService.class);
+
+        TokenizerFactory tokenizerFactory = analysisService
+                .tokenizer("kuromoji_tokenizer");
+        MatcherAssert.assertThat(tokenizerFactory,
+                instanceOf(KuromojiTokenizerFactory.class));
+
+        TokenFilterFactory filterFactory = analysisService
+                .tokenFilter("kuromoji_part_of_speech");
+        MatcherAssert.assertThat(filterFactory,
+                instanceOf(KuromojiPartOfSpeechFilterFactory.class));
+
+        filterFactory = analysisService.tokenFilter("kuromoji_readingform");
+        MatcherAssert.assertThat(filterFactory,
+                instanceOf(KuromojiReadingFormFilterFactory.class));
+
+        filterFactory = analysisService.tokenFilter("kuromoji_baseform");
+        MatcherAssert.assertThat(filterFactory,
+                instanceOf(KuromojiBaseFormFilterFactory.class));
+
+        filterFactory = analysisService.tokenFilter("kuromoji_stemmer");
+        MatcherAssert.assertThat(filterFactory,
+                instanceOf(KuromojiKatakanaStemmerFactory.class));
+
+        NamedAnalyzer analyzer = analysisService.analyzer("kuromoji");
+        MatcherAssert.assertThat(analyzer.analyzer(),
+                instanceOf(JapaneseAnalyzer.class));
+    }
+
+    @Test
+    public void testBaseFormFilterFactory() throws IOException {
+        AnalysisService analysisService = createAnalysisService();
+        TokenFilterFactory tokenFilter = analysisService
+                .tokenFilter("kuromoji_pos");
+        MatcherAssert.assertThat(tokenFilter,
+                instanceOf(KuromojiPartOfSpeechFilterFactory.class));
+        String source = "私は制限スピードを超える。";
+        String[] expected = new String[] { "私", "は", "制限", "スピード", "を" };
+        Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(source),
+                null, true, JapaneseTokenizer.Mode.SEARCH);
+        assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
+
+    }
+
+    @Test
+    public void testReadingFormFilterFactory() throws IOException {
+        AnalysisService analysisService = createAnalysisService();
+        TokenFilterFactory tokenFilter = analysisService
+                .tokenFilter("kuromoji_rf");
+        MatcherAssert.assertThat(tokenFilter,
+                instanceOf(KuromojiReadingFormFilterFactory.class));
+        String source = "今夜はロバート先生と話した";
+        String[] expected_tokens_romanji = new String[] { "kon'ya", "ha",
+                "robato", "sensei", "to", "hanashi", "ta" };
+
+        Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(source),
+                null, true, JapaneseTokenizer.Mode.SEARCH);
+
+        assertSimpleTSOutput(tokenFilter.create(tokenizer),
+                expected_tokens_romanji);
+
+        tokenizer = new JapaneseTokenizer(new StringReader(source), null, true,
+                JapaneseTokenizer.Mode.SEARCH);
+        String[] expected_tokens_katakana = new String[] { "コンヤ", "ハ", "ロバート",
+                "センセイ", "ト", "ハナシ", "タ" };
+        tokenFilter = analysisService.tokenFilter("kuromoji_readingform");
+        MatcherAssert.assertThat(tokenFilter,
+                instanceOf(KuromojiReadingFormFilterFactory.class));
+        assertSimpleTSOutput(tokenFilter.create(tokenizer),
+                expected_tokens_katakana);
+    }
+
+    @Test
+    public void testKatakanaStemFilter() throws IOException {
+        AnalysisService analysisService = createAnalysisService();
+        TokenFilterFactory tokenFilter = analysisService
+                .tokenFilter("kuromoji_stemmer");
+        MatcherAssert.assertThat(tokenFilter,
+                instanceOf(KuromojiKatakanaStemmerFactory.class));
+        String source = "明後日パーティーに行く予定がある。図書館で資料をコピーしました。";
+        ;
+        Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(source),
+                null, true, JapaneseTokenizer.Mode.SEARCH);
+
+        // パーティー should be stemmed by default
+        // (min len) コピー should not be stemmed
+        String[] expected_tokens_katakana = new String[] { "明後日", "パーティ", "に",
+                "行く", "予定", "が", "ある", "図書館", "で", "資料", "を", "コピー", "し", "まし",
+                "た" };
+        assertSimpleTSOutput(tokenFilter.create(tokenizer),
+                expected_tokens_katakana);
+
+        tokenFilter = analysisService.tokenFilter("kuromoji_ks");
+        MatcherAssert.assertThat(tokenFilter,
+                instanceOf(KuromojiKatakanaStemmerFactory.class));
+        tokenizer = new JapaneseTokenizer(new StringReader(source), null, true,
+                JapaneseTokenizer.Mode.SEARCH);
+
+        // パーティー should not be stemmed since min len == 6
+        // コピー should not be stemmed
+        expected_tokens_katakana = new String[] { "明後日", "パーティー", "に", "行く",
+                "予定", "が", "ある", "図書館", "で", "資料", "を", "コピー", "し", "まし", "た" };
+        assertSimpleTSOutput(tokenFilter.create(tokenizer),
+                expected_tokens_katakana);
+
+    }
+
+    public AnalysisService createAnalysisService() {
+        Index index = new Index("test");
+        Settings settings = settingsBuilder().loadFromClasspath(
+                "org/elasticsearch/index/analysis/kuromoji_analysis.json")
+                .build();
+        Injector parentInjector = new ModulesBuilder().add(
+                new SettingsModule(settings),
+                new EnvironmentModule(new Environment(settings)),
+                new IndicesAnalysisModule()).createInjector();
+        AnalysisModule analysisModule = new AnalysisModule(settings,
+                parentInjector.getInstance(IndicesAnalysisService.class));
+        Injector injector = new ModulesBuilder().add(
+                new IndexSettingsModule(index, settings),
+                new PluginsModule(settings, parentInjector
+                        .getInstance(PluginsService.class)),
+                new IndexNameModule(index), analysisModule)
+                .createChildInjector(parentInjector);
+
+        AnalysisService analysisService = injector
+                .getInstance(AnalysisService.class);
+        return analysisService;
+    }
+
+    public static void assertSimpleTSOutput(TokenStream stream,
+            String[] expected) throws IOException {
+        stream.reset();
+        CharTermAttribute termAttr = stream
+                .getAttribute(CharTermAttribute.class);
+        Assert.assertNotNull(termAttr);
+        int i = 0;
+        while (stream.incrementToken()) {
+            Assert.assertTrue(i < expected.length);
+            Assert.assertEquals(expected[i++], termAttr.toString(),
+                    "expected different term at index " + i);
+        }
+        Assert.assertEquals(i, expected.length, "not all tokens produced");
+    }
+
+}
--- a/src/test/java/org/elasticsearch/index/analysis/kuromoji_analysis.json
+++ b/src/test/java/org/elasticsearch/index/analysis/kuromoji_analysis.json
@ -0,0 +1,31 @@
+{
+    "index":{
+        "analysis":{
+            "filter":{
+                "kuromoji_rf":{
+                    "type":"kuromoji_readingform",
+                    "use_romaji" : "true"
+                },
+                "kuromoji_pos" : {
+                    "type": "kuromoji_part_of_speech",
+                    "enable_position_increment" : "false",
+                    "stoptags" : ["#  verb-main:", "動詞-自立"]
+                },
+                "kuromoji_ks" : {
+                    "type": "kuromoji_stemmer",
+                    "minimum_length" : 6
+                }
+                
+                
+            },
+            
+            "tokenizer" : {
+                "kuromoji" : {
+                   "type":"kuromoji_tokenizer"
+                }
+            
+            }
+            
+        }
+    }
+}
--- a/src/test/resources/es-plugin.properties
+++ b/src/test/resources/es-plugin.properties
@ -0,0 +1 @@
+plugin=org.elasticsearch.plugin.analysis.kuromoji.AnalysisKuromojiPlugin
				`@ -0,0 +1 @@`
				`plugin=org.elasticsearch.plugin.analysis.kuromoji.AnalysisKuromojiPlugin`