From e26f62a71e1825879f819d6fcc58f830bdfea9f1 Mon Sep 17 00:00:00 2001 From: Shay Banon Date: Thu, 29 Dec 2011 11:58:31 +0200 Subject: [PATCH] first! --- .gitignore | 7 + README.md | 16 ++ pom.xml | 137 ++++++++++++++++++ src/main/assemblies/plugin.xml | 26 ++++ .../SmartChineseAnalysisBinderProcessor.java | 40 +++++ .../SmartChineseAnalyzerProvider.java | 50 +++++++ .../SmartChineseSentenceTokenizerFactory.java | 45 ++++++ .../SmartChineseWordTokenFilterFactory.java | 43 ++++++ .../smartcn/AnalysisSmartChinesePlugin.java | 49 +++++++ src/main/resources/es-plugin.properties | 1 + .../SimpleSmartChineseAnalysisTests.java | 61 ++++++++ src/test/resources/log4j.properties | 5 + 12 files changed, 480 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 pom.xml create mode 100644 src/main/assemblies/plugin.xml create mode 100644 src/main/java/org/elasticsearch/index/analysis/SmartChineseAnalysisBinderProcessor.java create mode 100644 src/main/java/org/elasticsearch/index/analysis/SmartChineseAnalyzerProvider.java create mode 100644 src/main/java/org/elasticsearch/index/analysis/SmartChineseSentenceTokenizerFactory.java create mode 100644 src/main/java/org/elasticsearch/index/analysis/SmartChineseWordTokenFilterFactory.java create mode 100644 src/main/java/org/elasticsearch/plugin/analysis/smartcn/AnalysisSmartChinesePlugin.java create mode 100644 src/main/resources/es-plugin.properties create mode 100644 src/test/java/org/elasticsearch/index/analysis/SimpleSmartChineseAnalysisTests.java create mode 100644 src/test/resources/log4j.properties diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000000..06a1e6fedb6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +/data +/work +/logs +/.idea +/target +.DS_Store +*.iml diff --git a/README.md b/README.md new file mode 100644 index 00000000000..2bd3abcfa31 --- /dev/null +++ b/README.md @@ -0,0 +1,16 @@ +Smart Chinese Analysis for ElasticSearch +================================== + 
+The Smart Chinese Analysis plugin integrates the Lucene Smart Chinese analysis module into elasticsearch. + +In order to install the plugin, simply run: `bin/plugin -install elasticsearch/elasticsearch-analysis-smartcn/1.0.0`. + + -------------------------------------------------- + | Smart Chinese Analysis Plugin | ElasticSearch | + -------------------------------------------------- + | master | 0.18 -> master | + -------------------------------------------------- + | 1.0.0 | 0.18 -> master | + -------------------------------------------------- + +The plugin includes the `smartcn` analyzer, `smartcn_sentence` tokenizer, and `smartcn_word` token filter. diff --git a/pom.xml b/pom.xml new file mode 100644 index 00000000000..330fb740f1a --- /dev/null +++ b/pom.xml @@ -0,0 +1,137 @@ + + + elasticsearch-analysis-smartcn + 4.0.0 + org.elasticsearch + elasticsearch-analysis-smartcn + 1.0.0-SNAPSHOT + jar + Smart Chinese Analysis for ElasticSearch + 2009 + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + + + + scm:git:git@github.com:elasticsearch/elasticsearch-analysis-smartcn.git + scm:git:git@github.com:elasticsearch/elasticsearch-analysis-smartcn.git + + http://github.com/elasticsearch/elasticsearch-analysis-smartcn + + + + org.sonatype.oss + oss-parent + 7 + + + + 0.18.5 + + + + + + + + org.elasticsearch + elasticsearch + ${elasticsearch.version} + compile + + + + org.apache.lucene + lucene-smartcn + 3.5.0 + compile + + + + log4j + log4j + 1.2.16 + runtime + + + + org.testng + testng + 6.3.1 + test + + + + org.hamcrest + hamcrest-core + 1.3.RC2 + test + + + + org.hamcrest + hamcrest-library + 1.3.RC2 + test + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 2.3.2 + + 1.6 + 1.6 + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.11 + + + **/*Tests.java + + + + + org.apache.maven.plugins + maven-source-plugin + 2.1.2 + + + attach-sources + + jar + + + + + + maven-assembly-plugin + + 
${project.build.directory}/releases/ + + ${basedir}/src/main/assemblies/plugin.xml + + + + + package + + single + + + + + + + \ No newline at end of file diff --git a/src/main/assemblies/plugin.xml b/src/main/assemblies/plugin.xml new file mode 100644 index 00000000000..e41ab2ffb50 --- /dev/null +++ b/src/main/assemblies/plugin.xml @@ -0,0 +1,26 @@ + + + + + zip + + false + + + / + true + true + + org.elasticsearch:elasticsearch + + + + / + true + true + + org.apache.lucene:lucene-smartcn + + + + \ No newline at end of file diff --git a/src/main/java/org/elasticsearch/index/analysis/SmartChineseAnalysisBinderProcessor.java b/src/main/java/org/elasticsearch/index/analysis/SmartChineseAnalysisBinderProcessor.java new file mode 100644 index 00000000000..a2a209f0b89 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/SmartChineseAnalysisBinderProcessor.java @@ -0,0 +1,40 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.analysis; + +/** Binder processor that registers, on the bindings it is handed, the {@code smartcn} analyzer, the {@code smartcn_sentence} tokenizer and the {@code smartcn_word} token filter. + */ +public class SmartChineseAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor { + + @Override + public void processAnalyzers(AnalyzersBindings analyzersBindings) { + analyzersBindings.processAnalyzer("smartcn", SmartChineseAnalyzerProvider.class); + } + + @Override + public void processTokenizers(TokenizersBindings tokenizersBindings) { + tokenizersBindings.processTokenizer("smartcn_sentence", SmartChineseSentenceTokenizerFactory.class); + } + + @Override + public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) { + tokenFiltersBindings.processTokenFilter("smartcn_word", SmartChineseWordTokenFilterFactory.class); + } +} diff --git a/src/main/java/org/elasticsearch/index/analysis/SmartChineseAnalyzerProvider.java b/src/main/java/org/elasticsearch/index/analysis/SmartChineseAnalyzerProvider.java new file mode 100644 index 00000000000..f1a3f4a2c5c --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/SmartChineseAnalyzerProvider.java @@ -0,0 +1,50 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + +import java.util.Set; + +/** Analyzer provider that builds one {@link SmartChineseAnalyzer} in its constructor, from the stop words returned by {@code Analysis.parseStopWords} (which is passed {@code SmartChineseAnalyzer.getDefaultStopSet()}), and returns that same instance from {@link #get()}. + */ +public class SmartChineseAnalyzerProvider extends AbstractIndexAnalyzerProvider { + + private final SmartChineseAnalyzer analyzer; + + @Inject + public SmartChineseAnalyzerProvider(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name, settings); + Set stopWords = Analysis.parseStopWords(env, settings, SmartChineseAnalyzer.getDefaultStopSet(), version); + + analyzer = new SmartChineseAnalyzer(version, stopWords); + } + + @Override + public SmartChineseAnalyzer get() { + return this.analyzer; + } +} diff --git a/src/main/java/org/elasticsearch/index/analysis/SmartChineseSentenceTokenizerFactory.java b/src/main/java/org/elasticsearch/index/analysis/SmartChineseSentenceTokenizerFactory.java new file mode 100644 index 00000000000..cd27f52cca6 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/SmartChineseSentenceTokenizerFactory.java @@ -0,0 +1,45 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.cn.smart.SentenceTokenizer; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + +import java.io.Reader; + +/** Tokenizer factory whose {@link #create(Reader)} returns a new Lucene {@link SentenceTokenizer} over the given reader on every call. + */ +public class SmartChineseSentenceTokenizerFactory extends AbstractTokenizerFactory { + + @Inject + public SmartChineseSentenceTokenizerFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name, settings); + } + + @Override + public Tokenizer create(Reader reader) { + return new SentenceTokenizer(reader); + } +} diff --git a/src/main/java/org/elasticsearch/index/analysis/SmartChineseWordTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/SmartChineseWordTokenFilterFactory.java new file mode 100644 index 00000000000..510f9bc1a11 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/SmartChineseWordTokenFilterFactory.java @@ -0,0 +1,43 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.cn.smart.WordTokenFilter; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + +/** Token filter factory whose {@link #create(TokenStream)} wraps the given stream in a new Lucene {@link WordTokenFilter} on every call. + */ +public class SmartChineseWordTokenFilterFactory extends AbstractTokenFilterFactory { + + @Inject + public SmartChineseWordTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name, settings); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new WordTokenFilter(tokenStream); + } +} diff --git a/src/main/java/org/elasticsearch/plugin/analysis/smartcn/AnalysisSmartChinesePlugin.java b/src/main/java/org/elasticsearch/plugin/analysis/smartcn/AnalysisSmartChinesePlugin.java new file mode 100644 index 00000000000..45d9396f9e8 --- /dev/null +++ b/src/main/java/org/elasticsearch/plugin/analysis/smartcn/AnalysisSmartChinesePlugin.java @@ -0,0 +1,49 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.plugin.analysis.smartcn; + +import org.elasticsearch.common.inject.Module; +import org.elasticsearch.index.analysis.AnalysisModule; +import org.elasticsearch.index.analysis.SmartChineseAnalysisBinderProcessor; +import org.elasticsearch.plugins.AbstractPlugin; + +/** + * Plugin named {@code analysis-smartcn}: when {@link #processModule(Module)} is handed an {@link AnalysisModule} it adds a {@link SmartChineseAnalysisBinderProcessor} to it, and it does nothing for any other module type. + */ +public class AnalysisSmartChinesePlugin extends AbstractPlugin { + + @Override + public String name() { + return "analysis-smartcn"; + } + + @Override + public String description() { + return "Smart Chinese analysis support"; + } + + @Override + public void processModule(Module module) { + if (module instanceof AnalysisModule) { + AnalysisModule analysisModule = (AnalysisModule) module; + analysisModule.addProcessor(new SmartChineseAnalysisBinderProcessor()); + } + } +} diff --git a/src/main/resources/es-plugin.properties b/src/main/resources/es-plugin.properties new file mode 100644 index 00000000000..e1102b99970 --- /dev/null +++ b/src/main/resources/es-plugin.properties @@ -0,0 +1 @@ +plugin=org.elasticsearch.plugin.analysis.smartcn.AnalysisSmartChinesePlugin diff --git a/src/test/java/org/elasticsearch/index/analysis/SimpleSmartChineseAnalysisTests.java b/src/test/java/org/elasticsearch/index/analysis/SimpleSmartChineseAnalysisTests.java new file mode 100644 
index 00000000000..32624dc4780 --- /dev/null +++ b/src/test/java/org/elasticsearch/index/analysis/SimpleSmartChineseAnalysisTests.java @@ -0,0 +1,61 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import org.elasticsearch.common.inject.Injector; +import org.elasticsearch.common.inject.ModulesBuilder; +import org.elasticsearch.common.settings.SettingsModule; +import org.elasticsearch.env.Environment; +import org.elasticsearch.env.EnvironmentModule; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.IndexNameModule; +import org.elasticsearch.index.settings.IndexSettingsModule; +import org.elasticsearch.indices.analysis.IndicesAnalysisModule; +import org.elasticsearch.indices.analysis.IndicesAnalysisService; +import org.hamcrest.MatcherAssert; +import org.testng.annotations.Test; + +import static org.elasticsearch.common.settings.ImmutableSettings.Builder.EMPTY_SETTINGS; +import static org.hamcrest.Matchers.instanceOf; + +/** Builds an {@link AnalysisService} from empty settings with a {@link SmartChineseAnalysisBinderProcessor} added, then asserts that the {@code smartcn_sentence} tokenizer and the {@code smartcn_word} token filter resolve to the Smart Chinese factory classes; the {@code smartcn} analyzer is not asserted here. + */ +public class SimpleSmartChineseAnalysisTests { + + @Test + public void testDefaultsIcuAnalysis() { + Index index = new Index("test"); + + Injector parentInjector = new ModulesBuilder().add(new 
SettingsModule(EMPTY_SETTINGS), new EnvironmentModule(new Environment(EMPTY_SETTINGS)), new IndicesAnalysisModule()).createInjector(); + Injector injector = new ModulesBuilder().add( + new IndexSettingsModule(index, EMPTY_SETTINGS), + new IndexNameModule(index), + new AnalysisModule(EMPTY_SETTINGS, parentInjector.getInstance(IndicesAnalysisService.class)).addProcessor(new SmartChineseAnalysisBinderProcessor())) + .createChildInjector(parentInjector); + + AnalysisService analysisService = injector.getInstance(AnalysisService.class); + + TokenizerFactory tokenizerFactory = analysisService.tokenizer("smartcn_sentence"); + MatcherAssert.assertThat(tokenizerFactory, instanceOf(SmartChineseSentenceTokenizerFactory.class)); + + TokenFilterFactory filterFactory = analysisService.tokenFilter("smartcn_word"); + MatcherAssert.assertThat(filterFactory, instanceOf(SmartChineseWordTokenFilterFactory.class)); + } +} diff --git a/src/test/resources/log4j.properties b/src/test/resources/log4j.properties new file mode 100644 index 00000000000..497c97f9959 --- /dev/null +++ b/src/test/resources/log4j.properties @@ -0,0 +1,5 @@ +log4j.rootLogger=INFO, out + +log4j.appender.out=org.apache.log4j.ConsoleAppender +log4j.appender.out.layout=org.apache.log4j.PatternLayout +log4j.appender.out.layout.conversionPattern=[%d{ISO8601}][%-5p][%-25c] %m%n