Analysis ICU Plugin, closes #151

This commit is contained in:
kimchy 2010-04-27 23:54:30 +03:00
parent dfd002bf98
commit 11e4ad9bd6
32 changed files with 1097 additions and 152 deletions

View File

@ -21,6 +21,7 @@
<entry name="?*.yml" />
<entry name="?*.txt" />
<entry name="?*.pdf" />
<entry name="?*.nrm" />
</wildcardResourcePatterns>
<annotationProcessing enabled="false" useClasspath="true" />
</component>

View File

@ -69,6 +69,7 @@
<w>throwable</w>
<w>tika</w>
<w>timestamp</w>
<w>tokenizers</w>
<w>translog</w>
<w>traslog</w>
<w>trie</w>

View File

@ -5,6 +5,7 @@
<module fileurl="file://$PROJECT_DIR$/.idea/modules//benchmark-micro.iml" filepath="$PROJECT_DIR$/.idea/modules//benchmark-micro.iml" />
<module fileurl="file://$PROJECT_DIR$/.idea/modules/elasticsearch.iml" filepath="$PROJECT_DIR$/.idea/modules/elasticsearch.iml" />
<module fileurl="file://$PROJECT_DIR$/.idea/modules/elasticsearch-root.iml" filepath="$PROJECT_DIR$/.idea/modules/elasticsearch-root.iml" />
<module fileurl="file://$PROJECT_DIR$/.idea/modules/plugin-analysis-icu.iml" filepath="$PROJECT_DIR$/.idea/modules/plugin-analysis-icu.iml" />
<module fileurl="file://$PROJECT_DIR$/.idea/modules//plugin-client-groovy.iml" filepath="$PROJECT_DIR$/.idea/modules//plugin-client-groovy.iml" />
<module fileurl="file://$PROJECT_DIR$/.idea/modules//plugin-discovery-jgroups.iml" filepath="$PROJECT_DIR$/.idea/modules//plugin-discovery-jgroups.iml" />
<module fileurl="file://$PROJECT_DIR$/.idea/modules//plugin-mapper-attachments.iml" filepath="$PROJECT_DIR$/.idea/modules//plugin-mapper-attachments.iml" />

View File

@ -17,6 +17,7 @@
<orderEntry type="module" module-name="plugin-discovery-jgroups" />
<orderEntry type="module" module-name="plugin-mapper-attachments" />
<orderEntry type="module" module-name="plugin-transport-memcached" />
<orderEntry type="module" module-name="plugin-analysis-icu" />
<orderEntry type="module" module-name="test-integration" />
</component>
</module>

View File

@ -0,0 +1,54 @@
<?xml version="1.0" encoding="UTF-8"?>
<module version="4">
<component name="NewModuleRootManager" inherit-compiler-output="false">
<output url="file://$MODULE_DIR$/../../plugins/analysis/icu/build/classes/main" />
<output-test url="file://$MODULE_DIR$/../../plugins/analysis/icu/build/classes/test" />
<exclude-output />
<content url="file://$MODULE_DIR$/../../plugins/analysis/icu">
<sourceFolder url="file://$MODULE_DIR$/../../plugins/analysis/icu/src/main/java" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/../../plugins/analysis/icu/src/test/java" isTestSource="true" />
<excludeFolder url="file://$MODULE_DIR$/../../plugins/analysis/icu/build" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="module" module-name="elasticsearch" />
<orderEntry type="module-library">
<library name="icu4j">
<CLASSES>
<root url="jar://$GRADLE_REPOSITORY$/com.ibm.icu/icu4j/jars/icu4j-4.4.jar!/" />
</CLASSES>
<JAVADOC />
<SOURCES>
<!-- NOTE(review): every root below climbs five levels above $MODULE_DIR$ and then enters
     opt/icu4j/4.4, i.e. a directory outside this project tree. These look like paths from
     the committer's own machine; confirm they resolve elsewhere, or attach the icu4j
     sources through a repository-relative location instead. -->
<root url="file://$MODULE_DIR$/../../../../../opt/icu4j/4.4/main/tests/charset/src" />
<root url="file://$MODULE_DIR$/../../../../../opt/icu4j/4.4/main/tests/localespi/src" />
<root url="file://$MODULE_DIR$/../../../../../opt/icu4j/4.4/main/classes/translit/src" />
<root url="file://$MODULE_DIR$/../../../../../opt/icu4j/4.4/main/classes/langdata/src" />
<root url="file://$MODULE_DIR$/../../../../../opt/icu4j/4.4/main/tests/collate/src" />
<root url="file://$MODULE_DIR$/../../../../../opt/icu4j/4.4/main/classes/charset/src" />
<root url="file://$MODULE_DIR$/../../../../../opt/icu4j/4.4/main/classes/collate/src" />
<root url="file://$MODULE_DIR$/../../../../../opt/icu4j/4.4/main/tests/translit/src" />
<root url="file://$MODULE_DIR$/../../../../../opt/icu4j/4.4/main/tests/core/src" />
<root url="file://$MODULE_DIR$/../../../../../opt/icu4j/4.4/main/classes/core/src" />
<root url="file://$MODULE_DIR$/../../../../../opt/icu4j/4.4/main/classes/regiondata/src" />
<root url="file://$MODULE_DIR$/../../../../../opt/icu4j/4.4/main/tests/packaging/src" />
<root url="file://$MODULE_DIR$/../../../../../opt/icu4j/4.4/main/tests/framework/src" />
<root url="file://$MODULE_DIR$/../../../../../opt/icu4j/4.4/main/classes/currdata/src" />
<root url="file://$MODULE_DIR$/../../../../../opt/icu4j/4.4/main/classes/localespi/src" />
</SOURCES>
</library>
</orderEntry>
<orderEntry type="module" module-name="test-testng" />
<orderEntry type="library" name="testng" level="project" />
<orderEntry type="library" name="hamcrest" level="project" />
<orderEntry type="module-library">
<library name="lucene-collation">
<CLASSES>
<root url="jar://$GRADLE_REPOSITORY$/org.apache.lucene/lucene-collation/jars/lucene-collation-3.0.1.jar!/" />
</CLASSES>
<JAVADOC />
<SOURCES />
</library>
</orderEntry>
</component>
</module>

View File

@ -21,9 +21,13 @@ package org.elasticsearch.env;
import org.elasticsearch.cluster.ClusterName;
import org.elasticsearch.util.Classes;
import org.elasticsearch.util.io.Streams;
import org.elasticsearch.util.settings.Settings;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
@ -122,6 +126,10 @@ public class Environment {
return logsFile;
}
/**
 * Resolves {@code path} through {@link #resolveConfig(String)}, opens the resulting URL and
 * returns everything read from it as a single string, decoded as UTF-8.
 *
 * @param path the configuration path, handed unchanged to {@link #resolveConfig(String)}
 * @return the content read from the resolved location
 * @throws FailedToResolveConfigException as declared by {@link #resolveConfig(String)}
 * @throws IOException                    if the resolved URL cannot be opened or read
 */
public String resolveConfigAndLoadToString(String path) throws FailedToResolveConfigException, IOException {
    URL configLocation = resolveConfig(path);
    InputStream rawContent = configLocation.openStream();
    // NOTE(review): the stream is never closed in this method; confirm that
    // Streams.copyToString closes the reader it is given.
    InputStreamReader utf8Content = new InputStreamReader(rawContent, "UTF-8");
    return Streams.copyToString(utf8Content);
}
public URL resolveConfig(String path) throws FailedToResolveConfigException {
// first, try it as a path on the file system
File f1 = new File(path);

View File

@ -19,23 +19,46 @@
package org.elasticsearch.index.analysis;
import org.elasticsearch.util.gcommon.collect.Lists;
import org.elasticsearch.util.guice.inject.AbstractModule;
import org.elasticsearch.util.guice.inject.Scopes;
import org.elasticsearch.util.guice.inject.assistedinject.FactoryProvider;
import org.elasticsearch.util.guice.inject.multibindings.MapBinder;
import org.elasticsearch.util.settings.Settings;
import java.util.List;
import java.util.Map;
/**
* @author kimchy (Shay Banon)
* @author kimchy (shay.banon)
*/
public class AnalysisModule extends AbstractModule {
/**
 * Callback that contributes additional analysis bindings. During {@code configure()} every
 * processor registered on the module is called once per binder (token filters, tokenizers,
 * analyzers), receiving that binder together with the matching settings map.
 * <p>
 * The built-in processors treat {@code groupSettings} as keyed by component name and only
 * bind names that are not already present in it.
 */
public static interface AnalysisBinderProcessor {
/**
 * Adds token filter factory bindings.
 *
 * @param binder        the map binder that token filter factories are registered on
 * @param groupSettings the token filter settings map passed in by {@code configure()}
 */
void processTokenFilters(MapBinder<String, TokenFilterFactoryFactory> binder, Map<String, Settings> groupSettings);
/**
 * Adds tokenizer factory bindings.
 *
 * @param binder        the map binder that tokenizer factories are registered on
 * @param groupSettings the tokenizer settings map passed in by {@code configure()}
 */
void processTokenizers(MapBinder<String, TokenizerFactoryFactory> binder, Map<String, Settings> groupSettings);
/**
 * Adds analyzer provider bindings.
 *
 * @param binder        the map binder that analyzer providers are registered on
 * @param groupSettings the analyzer settings map passed in by {@code configure()}
 */
void processAnalyzers(MapBinder<String, AnalyzerProviderFactory> binder, Map<String, Settings> groupSettings);
}
private final Settings settings;
private final List<AnalysisBinderProcessor> processors = Lists.newArrayList();
/**
 * Creates the module and registers the built-in processors: the default one always, the
 * extended one only if it can be instantiated.
 *
 * @param settings the settings this module is configured from
 */
public AnalysisModule(Settings settings) {
    this.settings = settings;
    processors.add(new DefaultProcessor());
    try {
        AnalysisBinderProcessor extended = new ExtendedProcessor();
        processors.add(extended);
    } catch (Throwable ignored) {
        // ignore. no extended ones
        // Deliberately best-effort: any failure here just leaves the extended bindings out.
        // NOTE(review): presumably guards against the extended analysis classes being
        // absent from the classpath - confirm.
    }
}
/**
 * Registers an additional processor. It is appended after the processors added by the
 * constructor, and {@code configure()} calls the processors in list order.
 *
 * @param processor the processor to append
 * @return this module, so calls can be chained
 */
public AnalysisModule addProcessor(AnalysisBinderProcessor processor) {
processors.add(processor);
return this;
}
@Override protected void configure() {
@ -53,52 +76,9 @@ public class AnalysisModule extends AbstractModule {
}
tokenFilterBinder.addBinding(tokenFilterName).toProvider(FactoryProvider.newFactory(TokenFilterFactoryFactory.class, type)).in(Scopes.SINGLETON);
}
// add defaults
if (!tokenFiltersSettings.containsKey("stop")) {
tokenFilterBinder.addBinding("stop").toProvider(FactoryProvider.newFactory(TokenFilterFactoryFactory.class, StopTokenFilterFactory.class)).in(Scopes.SINGLETON);
}
if (!tokenFiltersSettings.containsKey("asciifolding")) {
tokenFilterBinder.addBinding("asciifolding").toProvider(FactoryProvider.newFactory(TokenFilterFactoryFactory.class, ASCIIFoldingTokenFilterFactory.class)).in(Scopes.SINGLETON);
}
if (!tokenFiltersSettings.containsKey("length")) {
tokenFilterBinder.addBinding("length").toProvider(FactoryProvider.newFactory(TokenFilterFactoryFactory.class, LengthTokenFilterFactory.class)).in(Scopes.SINGLETON);
}
if (!tokenFiltersSettings.containsKey("lowercase")) {
tokenFilterBinder.addBinding("lowercase").toProvider(FactoryProvider.newFactory(TokenFilterFactoryFactory.class, LowerCaseTokenFilterFactory.class)).in(Scopes.SINGLETON);
}
if (!tokenFiltersSettings.containsKey("porterStem")) {
tokenFilterBinder.addBinding("porterStem").toProvider(FactoryProvider.newFactory(TokenFilterFactoryFactory.class, PorterStemTokenFilterFactory.class)).in(Scopes.SINGLETON);
}
if (!tokenFiltersSettings.containsKey("standard")) {
tokenFilterBinder.addBinding("standard").toProvider(FactoryProvider.newFactory(TokenFilterFactoryFactory.class, StandardTokenFilterFactory.class)).in(Scopes.SINGLETON);
}
if (!tokenFiltersSettings.containsKey("nGram")) {
tokenFilterBinder.addBinding("nGram").toProvider(FactoryProvider.newFactory(TokenFilterFactoryFactory.class, NGramTokenFilterFactory.class)).in(Scopes.SINGLETON);
}
if (!tokenFiltersSettings.containsKey("edgeNGram")) {
tokenFilterBinder.addBinding("edgeNGram").toProvider(FactoryProvider.newFactory(TokenFilterFactoryFactory.class, EdgeNGramTokenFilterFactory.class)).in(Scopes.SINGLETON);
}
if (!tokenFiltersSettings.containsKey("shingle")) {
tokenFilterBinder.addBinding("shingle").toProvider(FactoryProvider.newFactory(TokenFilterFactoryFactory.class, ShingleTokenFilterFactory.class)).in(Scopes.SINGLETON);
}
// extends defaults
if (!tokenFiltersSettings.containsKey("arabicStem")) {
tokenFilterBinder.addBinding("arabicStem").toProvider(FactoryProvider.newFactory(TokenFilterFactoryFactory.class, ArabicStemTokenFilterFactory.class)).in(Scopes.SINGLETON);
}
if (!tokenFiltersSettings.containsKey("brazilianStem")) {
tokenFilterBinder.addBinding("brazilianStem").toProvider(FactoryProvider.newFactory(TokenFilterFactoryFactory.class, BrazilianStemTokenFilterFactory.class)).in(Scopes.SINGLETON);
}
if (!tokenFiltersSettings.containsKey("dutchStem")) {
tokenFilterBinder.addBinding("dutchStem").toProvider(FactoryProvider.newFactory(TokenFilterFactoryFactory.class, DutchStemTokenFilterFactory.class)).in(Scopes.SINGLETON);
}
if (!tokenFiltersSettings.containsKey("frenchStem")) {
tokenFilterBinder.addBinding("frenchStem").toProvider(FactoryProvider.newFactory(TokenFilterFactoryFactory.class, FrenchStemTokenFilterFactory.class)).in(Scopes.SINGLETON);
}
if (!tokenFiltersSettings.containsKey("germanStem")) {
tokenFilterBinder.addBinding("germanStem").toProvider(FactoryProvider.newFactory(TokenFilterFactoryFactory.class, GermanStemTokenFilterFactory.class)).in(Scopes.SINGLETON);
}
if (!tokenFiltersSettings.containsKey("russianStem")) {
tokenFilterBinder.addBinding("russianStem").toProvider(FactoryProvider.newFactory(TokenFilterFactoryFactory.class, RussianStemTokenFilterFactory.class)).in(Scopes.SINGLETON);
for (AnalysisBinderProcessor processor : processors) {
processor.processTokenFilters(tokenFilterBinder, tokenFiltersSettings);
}
MapBinder<String, TokenizerFactoryFactory> tokenizerBinder
@ -115,29 +95,10 @@ public class AnalysisModule extends AbstractModule {
}
tokenizerBinder.addBinding(tokenizerName).toProvider(FactoryProvider.newFactory(TokenizerFactoryFactory.class, type)).in(Scopes.SINGLETON);
}
// add defaults
if (!tokenizersSettings.containsKey("standard")) {
tokenizerBinder.addBinding("standard").toProvider(FactoryProvider.newFactory(TokenizerFactoryFactory.class, StandardTokenizerFactory.class)).in(Scopes.SINGLETON);
}
if (!tokenizersSettings.containsKey("keyword")) {
tokenizerBinder.addBinding("keyword").toProvider(FactoryProvider.newFactory(TokenizerFactoryFactory.class, KeywordTokenizerFactory.class)).in(Scopes.SINGLETON);
}
if (!tokenizersSettings.containsKey("letter")) {
tokenizerBinder.addBinding("letter").toProvider(FactoryProvider.newFactory(TokenizerFactoryFactory.class, LetterTokenizerFactory.class)).in(Scopes.SINGLETON);
}
if (!tokenizersSettings.containsKey("lowercase")) {
tokenizerBinder.addBinding("lowercase").toProvider(FactoryProvider.newFactory(TokenizerFactoryFactory.class, LowerCaseTokenizerFactory.class)).in(Scopes.SINGLETON);
}
if (!tokenizersSettings.containsKey("whitespace")) {
tokenizerBinder.addBinding("whitespace").toProvider(FactoryProvider.newFactory(TokenizerFactoryFactory.class, WhitespaceTokenizerFactory.class)).in(Scopes.SINGLETON);
}
if (!tokenizersSettings.containsKey("nGram")) {
tokenizerBinder.addBinding("nGram").toProvider(FactoryProvider.newFactory(TokenizerFactoryFactory.class, NGramTokenizerFactory.class)).in(Scopes.SINGLETON);
}
if (!tokenizersSettings.containsKey("edgeNGram")) {
tokenizerBinder.addBinding("edgeNGram").toProvider(FactoryProvider.newFactory(TokenizerFactoryFactory.class, EdgeNGramTokenizerFactory.class)).in(Scopes.SINGLETON);
}
for (AnalysisBinderProcessor processor : processors) {
processor.processTokenizers(tokenizerBinder, tokenizersSettings);
}
MapBinder<String, AnalyzerProviderFactory> analyzerBinder
= MapBinder.newMapBinder(binder(), String.class, AnalyzerProviderFactory.class);
@ -160,6 +121,184 @@ public class AnalysisModule extends AbstractModule {
analyzerBinder.addBinding(analyzerName).toProvider(FactoryProvider.newFactory(AnalyzerProviderFactory.class, type)).in(Scopes.SINGLETON);
}
for (AnalysisBinderProcessor processor : processors) {
processor.processAnalyzers(analyzerBinder, analyzersSettings);
}
bind(AnalysisService.class).in(Scopes.SINGLETON);
}
/**
 * Registers the default token filters, tokenizers and analyzers. A name is bound only when
 * the settings map handed to the processor does not already contain it.
 */
private static class DefaultProcessor implements AnalysisBinderProcessor {

    /**
     * Binds the default token filters. Several filters are registered under two spellings
     * (for example {@code porterStem} and {@code porter_stem}); each spelling is checked
     * against {@code groupSettings} independently.
     */
    @Override public void processTokenFilters(MapBinder<String, TokenFilterFactoryFactory> binder, Map<String, Settings> groupSettings) {
        // add defaults
        bindIfAbsent(binder, groupSettings, TokenFilterFactoryFactory.class, "stop", StopTokenFilterFactory.class);
        bindIfAbsent(binder, groupSettings, TokenFilterFactoryFactory.class, "asciifolding", ASCIIFoldingTokenFilterFactory.class);
        bindIfAbsent(binder, groupSettings, TokenFilterFactoryFactory.class, "length", LengthTokenFilterFactory.class);
        bindIfAbsent(binder, groupSettings, TokenFilterFactoryFactory.class, "lowercase", LowerCaseTokenFilterFactory.class);
        bindIfAbsent(binder, groupSettings, TokenFilterFactoryFactory.class, "porterStem", PorterStemTokenFilterFactory.class);
        bindIfAbsent(binder, groupSettings, TokenFilterFactoryFactory.class, "porter_stem", PorterStemTokenFilterFactory.class);
        bindIfAbsent(binder, groupSettings, TokenFilterFactoryFactory.class, "standard", StandardTokenFilterFactory.class);
        bindIfAbsent(binder, groupSettings, TokenFilterFactoryFactory.class, "nGram", NGramTokenFilterFactory.class);
        bindIfAbsent(binder, groupSettings, TokenFilterFactoryFactory.class, "ngram", NGramTokenFilterFactory.class);
        bindIfAbsent(binder, groupSettings, TokenFilterFactoryFactory.class, "edgeNGram", EdgeNGramTokenFilterFactory.class);
        bindIfAbsent(binder, groupSettings, TokenFilterFactoryFactory.class, "edge_ngram", EdgeNGramTokenFilterFactory.class);
        bindIfAbsent(binder, groupSettings, TokenFilterFactoryFactory.class, "shingle", ShingleTokenFilterFactory.class);
    }

    /**
     * Binds the default tokenizers.
     */
    @Override public void processTokenizers(MapBinder<String, TokenizerFactoryFactory> binder, Map<String, Settings> groupSettings) {
        // add defaults
        bindIfAbsent(binder, groupSettings, TokenizerFactoryFactory.class, "standard", StandardTokenizerFactory.class);
        bindIfAbsent(binder, groupSettings, TokenizerFactoryFactory.class, "keyword", KeywordTokenizerFactory.class);
        bindIfAbsent(binder, groupSettings, TokenizerFactoryFactory.class, "letter", LetterTokenizerFactory.class);
        bindIfAbsent(binder, groupSettings, TokenizerFactoryFactory.class, "lowercase", LowerCaseTokenizerFactory.class);
        bindIfAbsent(binder, groupSettings, TokenizerFactoryFactory.class, "whitespace", WhitespaceTokenizerFactory.class);
    }

    /**
     * Binds the default analyzers.
     */
    @Override public void processAnalyzers(MapBinder<String, AnalyzerProviderFactory> binder, Map<String, Settings> groupSettings) {
        bindIfAbsent(binder, groupSettings, AnalyzerProviderFactory.class, "standard", StandardAnalyzerProvider.class);
        bindIfAbsent(binder, groupSettings, AnalyzerProviderFactory.class, "simple", SimpleAnalyzerProvider.class);
        bindIfAbsent(binder, groupSettings, AnalyzerProviderFactory.class, "stop", StopAnalyzerProvider.class);
        bindIfAbsent(binder, groupSettings, AnalyzerProviderFactory.class, "whitespace", WhitespaceAnalyzerProvider.class);
        bindIfAbsent(binder, groupSettings, AnalyzerProviderFactory.class, "keyword", KeywordAnalyzerProvider.class);
    }

    /**
     * Binds {@code name} to a singleton factory for {@code implementationType}, unless
     * {@code groupSettings} already has an entry under that name.
     *
     * @param binder             the map binder to register on
     * @param groupSettings      the settings map consulted for an existing entry
     * @param factoryType        the factory interface handed to {@code FactoryProvider.newFactory}
     * @param name               the name to bind
     * @param implementationType the class the generated factory creates
     */
    private static <F> void bindIfAbsent(MapBinder<String, F> binder, Map<String, Settings> groupSettings,
                                         Class<F> factoryType, String name, Class<?> implementationType) {
        if (groupSettings.containsKey(name)) {
            return;
        }
        binder.addBinding(name).toProvider(FactoryProvider.newFactory(factoryType, implementationType)).in(Scopes.SINGLETON);
    }
}
/**
 * Registers the extended (mostly language specific) token filters, tokenizers and analyzers.
 * A name is bound only when the settings map handed to the processor does not already
 * contain it.
 */
private static class ExtendedProcessor implements AnalysisBinderProcessor {

    /**
     * Binds the language stemming filters, each under a camelCase and an underscore
     * spelling; the two spellings are checked against {@code groupSettings} independently.
     */
    @Override public void processTokenFilters(MapBinder<String, TokenFilterFactoryFactory> binder, Map<String, Settings> groupSettings) {
        bindIfAbsent(binder, groupSettings, TokenFilterFactoryFactory.class, "arabicStem", ArabicStemTokenFilterFactory.class);
        bindIfAbsent(binder, groupSettings, TokenFilterFactoryFactory.class, "arabic_stem", ArabicStemTokenFilterFactory.class);
        bindIfAbsent(binder, groupSettings, TokenFilterFactoryFactory.class, "brazilianStem", BrazilianStemTokenFilterFactory.class);
        bindIfAbsent(binder, groupSettings, TokenFilterFactoryFactory.class, "brazilian_stem", BrazilianStemTokenFilterFactory.class);
        bindIfAbsent(binder, groupSettings, TokenFilterFactoryFactory.class, "dutchStem", DutchStemTokenFilterFactory.class);
        bindIfAbsent(binder, groupSettings, TokenFilterFactoryFactory.class, "dutch_stem", DutchStemTokenFilterFactory.class);
        bindIfAbsent(binder, groupSettings, TokenFilterFactoryFactory.class, "frenchStem", FrenchStemTokenFilterFactory.class);
        bindIfAbsent(binder, groupSettings, TokenFilterFactoryFactory.class, "french_stem", FrenchStemTokenFilterFactory.class);
        bindIfAbsent(binder, groupSettings, TokenFilterFactoryFactory.class, "germanStem", GermanStemTokenFilterFactory.class);
        bindIfAbsent(binder, groupSettings, TokenFilterFactoryFactory.class, "german_stem", GermanStemTokenFilterFactory.class);
        bindIfAbsent(binder, groupSettings, TokenFilterFactoryFactory.class, "russianStem", RussianStemTokenFilterFactory.class);
        bindIfAbsent(binder, groupSettings, TokenFilterFactoryFactory.class, "russian_stem", RussianStemTokenFilterFactory.class);
    }

    /**
     * Binds the n-gram and edge n-gram tokenizers, each under two spellings.
     */
    @Override public void processTokenizers(MapBinder<String, TokenizerFactoryFactory> binder, Map<String, Settings> groupSettings) {
        bindIfAbsent(binder, groupSettings, TokenizerFactoryFactory.class, "nGram", NGramTokenizerFactory.class);
        bindIfAbsent(binder, groupSettings, TokenizerFactoryFactory.class, "ngram", NGramTokenizerFactory.class);
        bindIfAbsent(binder, groupSettings, TokenizerFactoryFactory.class, "edgeNGram", EdgeNGramTokenizerFactory.class);
        bindIfAbsent(binder, groupSettings, TokenizerFactoryFactory.class, "edge_ngram", EdgeNGramTokenizerFactory.class);
    }

    /**
     * Binds the language specific analyzers.
     */
    @Override public void processAnalyzers(MapBinder<String, AnalyzerProviderFactory> binder, Map<String, Settings> groupSettings) {
        bindIfAbsent(binder, groupSettings, AnalyzerProviderFactory.class, "arabic", ArabicAnalyzerProvider.class);
        bindIfAbsent(binder, groupSettings, AnalyzerProviderFactory.class, "brazilian", BrazilianAnalyzerProvider.class);
        bindIfAbsent(binder, groupSettings, AnalyzerProviderFactory.class, "chinese", ChineseAnalyzerProvider.class);
        // NOTE(review): "cjk" is bound to the same provider class as "chinese". Confirm this
        // is intended rather than a dedicated CJK analyzer provider.
        bindIfAbsent(binder, groupSettings, AnalyzerProviderFactory.class, "cjk", ChineseAnalyzerProvider.class);
        bindIfAbsent(binder, groupSettings, AnalyzerProviderFactory.class, "czech", CzechAnalyzerProvider.class);
        bindIfAbsent(binder, groupSettings, AnalyzerProviderFactory.class, "dutch", DutchAnalyzerProvider.class);
        bindIfAbsent(binder, groupSettings, AnalyzerProviderFactory.class, "french", FrenchAnalyzerProvider.class);
        bindIfAbsent(binder, groupSettings, AnalyzerProviderFactory.class, "german", GermanAnalyzerProvider.class);
        bindIfAbsent(binder, groupSettings, AnalyzerProviderFactory.class, "greek", GreekAnalyzerProvider.class);
        bindIfAbsent(binder, groupSettings, AnalyzerProviderFactory.class, "persian", PersianAnalyzerProvider.class);
        bindIfAbsent(binder, groupSettings, AnalyzerProviderFactory.class, "russian", RussianAnalyzerProvider.class);
        bindIfAbsent(binder, groupSettings, AnalyzerProviderFactory.class, "thai", ThaiAnalyzerProvider.class);
    }

    /**
     * Binds {@code name} to a singleton factory for {@code implementationType}, unless
     * {@code groupSettings} already has an entry under that name.
     *
     * @param binder             the map binder to register on
     * @param groupSettings      the settings map consulted for an existing entry
     * @param factoryType        the factory interface handed to {@code FactoryProvider.newFactory}
     * @param name               the name to bind
     * @param implementationType the class the generated factory creates
     */
    private static <F> void bindIfAbsent(MapBinder<String, F> binder, Map<String, Settings> groupSettings,
                                         Class<F> factoryType, String name, Class<?> implementationType) {
        if (groupSettings.containsKey(name)) {
            return;
        }
        binder.addBinding(name).toProvider(FactoryProvider.newFactory(factoryType, implementationType)).in(Scopes.SINGLETON);
    }
}
}

View File

@ -74,22 +74,6 @@ public class AnalysisService extends AbstractIndexComponent implements Closeable
}
}
// add some defaults
if (!analyzerProviders.containsKey("standard")) {
analyzerProviders.put("standard", new StandardAnalyzerProvider(index, indexSettings, "standard", ImmutableSettings.Builder.EMPTY_SETTINGS));
}
if (!analyzerProviders.containsKey("simple")) {
analyzerProviders.put("simple", new SimpleAnalyzerProvider(index, indexSettings, "simple", ImmutableSettings.Builder.EMPTY_SETTINGS));
}
if (!analyzerProviders.containsKey("stop")) {
analyzerProviders.put("stop", new StopAnalyzerProvider(index, indexSettings, "stop", ImmutableSettings.Builder.EMPTY_SETTINGS));
}
if (!analyzerProviders.containsKey("whitespace")) {
analyzerProviders.put("whitespace", new WhitespaceAnalyzerProvider(index, indexSettings, "whitespace", ImmutableSettings.Builder.EMPTY_SETTINGS));
}
if (!analyzerProviders.containsKey("keyword")) {
analyzerProviders.put("keyword", new KeywordAnalyzerProvider(index, indexSettings, "keyword", ImmutableSettings.Builder.EMPTY_SETTINGS));
}
if (!analyzerProviders.containsKey("default")) {
analyzerProviders.put("default", new StandardAnalyzerProvider(index, indexSettings, "default", ImmutableSettings.Builder.EMPTY_SETTINGS));
}
@ -100,45 +84,6 @@ public class AnalysisService extends AbstractIndexComponent implements Closeable
analyzerProviders.put("default_search", analyzerProviders.get("default"));
}
// extended analyzers defaults
if (!analyzerProviders.containsKey("arabic")) {
analyzerProviders.put("arabic", new ArabicAnalyzerProvider(index, indexSettings, "arabic", ImmutableSettings.Builder.EMPTY_SETTINGS));
}
if (!analyzerProviders.containsKey("brazilian")) {
analyzerProviders.put("brazilian", new BrazilianAnalyzerProvider(index, indexSettings, "brazilian", ImmutableSettings.Builder.EMPTY_SETTINGS));
}
if (!analyzerProviders.containsKey("chinese")) {
analyzerProviders.put("chinese", new ChineseAnalyzerProvider(index, indexSettings, "chinese", ImmutableSettings.Builder.EMPTY_SETTINGS));
}
if (!analyzerProviders.containsKey("cjk")) {
analyzerProviders.put("cjk", new ChineseAnalyzerProvider(index, indexSettings, "cjk", ImmutableSettings.Builder.EMPTY_SETTINGS));
}
if (!analyzerProviders.containsKey("czech")) {
analyzerProviders.put("czech", new CzechAnalyzerProvider(index, indexSettings, "czech", ImmutableSettings.Builder.EMPTY_SETTINGS));
}
if (!analyzerProviders.containsKey("dutch")) {
analyzerProviders.put("dutch", new DutchAnalyzerProvider(index, indexSettings, "dutch", ImmutableSettings.Builder.EMPTY_SETTINGS));
}
if (!analyzerProviders.containsKey("french")) {
analyzerProviders.put("french", new FrenchAnalyzerProvider(index, indexSettings, "french", ImmutableSettings.Builder.EMPTY_SETTINGS));
}
if (!analyzerProviders.containsKey("german")) {
analyzerProviders.put("german", new GermanAnalyzerProvider(index, indexSettings, "german", ImmutableSettings.Builder.EMPTY_SETTINGS));
}
if (!analyzerProviders.containsKey("greek")) {
analyzerProviders.put("greek", new GreekAnalyzerProvider(index, indexSettings, "greek", ImmutableSettings.Builder.EMPTY_SETTINGS));
}
if (!analyzerProviders.containsKey("persian")) {
analyzerProviders.put("persian", new PersianAnalyzerProvider(index, indexSettings, "persian", ImmutableSettings.Builder.EMPTY_SETTINGS));
}
if (!analyzerProviders.containsKey("russian")) {
analyzerProviders.put("russian", new RussianAnalyzerProvider(index, indexSettings, "russian", ImmutableSettings.Builder.EMPTY_SETTINGS));
}
if (!analyzerProviders.containsKey("thai")) {
analyzerProviders.put("thai", new ThaiAnalyzerProvider(index, indexSettings, "thai", ImmutableSettings.Builder.EMPTY_SETTINGS));
}
this.analyzerProviders = ImmutableMap.copyOf(analyzerProviders);
Map<String, NamedAnalyzer> analyzers = newHashMap();

View File

@ -20,6 +20,7 @@
package org.elasticsearch.index.service;
import org.elasticsearch.util.gcommon.collect.ImmutableMap;
import org.elasticsearch.util.gcommon.collect.Lists;
import org.elasticsearch.util.gcommon.collect.UnmodifiableIterator;
import org.elasticsearch.util.guice.inject.Inject;
import org.elasticsearch.util.guice.inject.Injector;
@ -56,9 +57,11 @@ import org.elasticsearch.plugins.PluginsService;
import org.elasticsearch.plugins.ShardsPluginsModule;
import org.elasticsearch.util.component.CloseableIndexComponent;
import org.elasticsearch.util.guice.Injectors;
import org.elasticsearch.util.guice.inject.Module;
import org.elasticsearch.util.settings.Settings;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Set;
@ -189,16 +192,20 @@ public class InternalIndexService extends AbstractIndexComponent implements Inde
logger.debug("Creating shard_id[{}]", shardId.id());
Injector shardInjector = injector.createChildInjector(
new ShardsPluginsModule(indexSettings, pluginsService),
new IndexShardModule(shardId),
new StoreModule(indexSettings),
new DeletionPolicyModule(indexSettings),
new MergePolicyModule(indexSettings),
new MergeSchedulerModule(indexSettings),
new TranslogModule(indexSettings),
new EngineModule(indexSettings),
new IndexShardGatewayModule(injector.getInstance(IndexGateway.class)));
List<Module> modules = Lists.newArrayList();
modules.add(new ShardsPluginsModule(indexSettings, pluginsService));
modules.add(new IndexShardModule(shardId));
modules.add(new StoreModule(indexSettings));
modules.add(new DeletionPolicyModule(indexSettings));
modules.add(new MergePolicyModule(indexSettings));
modules.add(new MergeSchedulerModule(indexSettings));
modules.add(new TranslogModule(indexSettings));
modules.add(new EngineModule(indexSettings));
modules.add(new IndexShardGatewayModule(injector.getInstance(IndexGateway.class)));
pluginsService.processModules(modules);
Injector shardInjector = injector.createChildInjector(modules);
shardsInjectors = newMapBuilder(shardsInjectors).put(shardId.id(), shardInjector).immutableMap();

View File

@ -47,8 +47,10 @@ import org.elasticsearch.util.component.AbstractLifecycleComponent;
import org.elasticsearch.util.component.CloseableIndexComponent;
import org.elasticsearch.util.concurrent.ThreadSafe;
import org.elasticsearch.util.guice.Injectors;
import org.elasticsearch.util.guice.inject.Module;
import org.elasticsearch.util.settings.Settings;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
@ -167,19 +169,23 @@ public class InternalIndicesService extends AbstractLifecycleComponent<IndicesSe
.globalSettings(settings.getGlobalSettings())
.build();
Injector indexInjector = injector.createChildInjector(
new IndexNameModule(index),
new LocalNodeIdModule(localNodeId),
new IndexSettingsModule(indexSettings),
new IndicesPluginsModule(indexSettings, pluginsService),
new AnalysisModule(indexSettings),
new SimilarityModule(indexSettings),
new IndexCacheModule(indexSettings),
new IndexQueryParserModule(indexSettings),
new MapperServiceModule(),
new IndexGatewayModule(indexSettings, injector.getInstance(Gateway.class)),
new OperationRoutingModule(indexSettings),
new IndexModule());
ArrayList<Module> modules = new ArrayList<Module>();
modules.add(new IndexNameModule(index));
modules.add(new LocalNodeIdModule(localNodeId));
modules.add(new IndexSettingsModule(indexSettings));
modules.add(new IndicesPluginsModule(indexSettings, pluginsService));
modules.add(new AnalysisModule(indexSettings));
modules.add(new SimilarityModule(indexSettings));
modules.add(new IndexCacheModule(indexSettings));
modules.add(new IndexQueryParserModule(indexSettings));
modules.add(new MapperServiceModule());
modules.add(new IndexGatewayModule(indexSettings, injector.getInstance(Gateway.class)));
modules.add(new OperationRoutingModule(indexSettings));
modules.add(new IndexModule());
pluginsService.processModules(modules);
Injector indexInjector = injector.createChildInjector(modules);
indicesInjectors.put(index.name(), indexInjector);

View File

@ -131,6 +131,7 @@ public final class InternalNode implements Node {
modules.add(new GatewayModule(settings));
modules.add(new NodeClientModule());
pluginsService.processModules(modules);
injector = Guice.createInjector(modules);

View File

@ -74,4 +74,8 @@ public abstract class AbstractPlugin implements Plugin {
@Override public Collection<Class<? extends CloseableIndexComponent>> shardServices() {
return ImmutableList.of();
}
/**
 * Default implementation of the module processing hook: the given module is left untouched.
 * Subclasses override this method when they need to customize a module instance.
 *
 * @param module the module instance handed to the plugin
 */
@Override public void processModule(Module module) {
// nothing to do here
}
}

View File

@ -71,4 +71,6 @@ public interface Plugin {
* Per index shard service that will be automatically closed.
*/
Collection<Class<? extends CloseableIndexComponent>> shardServices();
/**
 * Hook that hands an already constructed module instance to the plugin so the plugin can
 * inspect or customize it.
 *
 * @param module the module instance to process
 */
void processModule(Module module);
}

View File

@ -70,6 +70,14 @@ public class PluginsService extends AbstractComponent {
return this.settings;
}
/**
 * Passes every given module to every registered plugin by calling
 * {@link Plugin#processModule(Module)}. Modules are visited in iteration order and, for each
 * module, plugins are visited in the order returned by {@code plugins.values()}.
 *
 * @param modules the module instances to offer to the plugins
 */
public void processModules(Iterable<Module> modules) {
    for (final Module candidate : modules) {
        for (final Plugin registered : plugins.values()) {
            registered.processModule(candidate);
        }
    }
}
public Collection<Class<? extends Module>> modules() {
List<Class<? extends Module>> modules = Lists.newArrayList();
for (Plugin plugin : plugins.values()) {

View File

@ -0,0 +1,51 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.util.lucene.analysis;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
 * A {@link CharSequence} view over a Lucene {@link TermAttribute}. The view does not copy the
 * term: every call reads the attribute's current term buffer and term length, so the sequence
 * always reflects whatever term the attribute holds at the time of the call.
 *
 * @author kimchy (shay.banon)
 */
// TODO Lucene Monitor: Once 3.1 is out, no need for this class anymore, use CharTermAttribute
public class CharSequenceTermAttribute implements CharSequence {

    private final TermAttribute termAtt;

    /**
     * @param termAtt the term attribute to expose as a character sequence
     */
    public CharSequenceTermAttribute(TermAttribute termAtt) {
        this.termAtt = termAtt;
    }

    /**
     * Returns the current term length of the wrapped attribute.
     */
    @Override public int length() {
        return termAtt.termLength();
    }

    /**
     * Returns the character at the given position of the current term.
     *
     * @throws IndexOutOfBoundsException if {@code index} is not within the current term
     */
    @Override public char charAt(int index) {
        // the backing buffer can be longer than the term, so the upper bound is checked explicitly
        if (index >= length()) {
            throw new IndexOutOfBoundsException("index [" + index + "], length [" + length() + "]");
        }
        return termAtt.termBuffer()[index];
    }

    /**
     * Returns a copy of the characters between {@code start} (inclusive) and {@code end} (exclusive).
     *
     * @throws IndexOutOfBoundsException if the requested range is not within the current term
     */
    @Override public CharSequence subSequence(int start, int end) {
        int length = length();
        if (!(start <= length && end <= length)) {
            throw new IndexOutOfBoundsException("start [" + start + "], end [" + end + "], length [" + length + "]");
        }
        return new String(termAtt.termBuffer(), start, end - start);
    }

    /**
     * Returns the current term as a string. The {@link CharSequence} contract requires
     * {@code toString()} to return exactly the characters of the sequence; without this override
     * callers would get the identity string inherited from {@link Object}.
     */
    @Override public String toString() {
        return new String(termAtt.termBuffer(), 0, termAtt.termLength());
    }
}

View File

@ -0,0 +1,142 @@
// Build script of the ICU analysis plugin. The plugin is built against the core ':elasticsearch' project.
dependsOn(':elasticsearch')
apply plugin: 'java'
apply plugin: 'maven'
// base name of the archives produced by this project
archivesBaseName = "elasticsearch-analysis-icu"
// staging directory that the explodedDist, zip and release tasks work on
explodedDistDir = new File(distsDir, 'exploded')
manifest.mainAttributes("Implementation-Title": "ElasticSearch::Plugins::Analysis::ICU", "Implementation-Version": rootProject.version, "Implementation-Date": buildTimeStr)
configurations.compile.transitive = true
configurations.testCompile.transitive = true
// no need to use the resource dir: resources are picked up from the java source directories
sourceSets.main.resources.srcDirs 'src/main/java'
sourceSets.test.resources.srcDirs 'src/test/java'
// add the source files to the dist jar
//jar {
// from sourceSets.main.allJava
//}
// distLib holds the third party jars that explodedDist copies into the plugin distribution;
// it is resolved without transitive dependencies
configurations {
dists
distLib {
visible = false
transitive = false
}
}
// every third party jar needed at runtime is declared twice: once to compile against it
// and once in distLib so that it ends up in the distribution
dependencies {
compile project(':elasticsearch')
compile('com.ibm.icu:icu4j:4.4') { transitive = false }
distLib('com.ibm.icu:icu4j:4.4') { transitive = false }
compile('org.apache.lucene:lucene-collation:3.0.1') { transitive = false }
distLib('org.apache.lucene:lucene-collation:3.0.1') { transitive = false }
testCompile project(':test-testng')
testCompile('org.testng:testng:5.10:jdk15') { transitive = false }
testCompile 'org.hamcrest:hamcrest-all:1.1'
}
// Test task: runs the TestNG suite of this project.
test {
    useTestNG()
    // the property is "jvmArgs"; the previous spelling "jmvArgs" was never read by the test
    // task, so assertions (-ea) and the heap limit were not passed to the test JVM
    jvmArgs = ["-ea", "-Xmx1024m"]
    suiteName = project.name
    listeners = ["org.elasticsearch.util.testng.Listeners"]
    // logging configuration used by tests, overridable from the command line
    systemProperties["es.test.log.conf"] = System.getProperty("es.test.log.conf", "log4j-gradle.properties")
}
// Assembles the plugin distribution layout in explodedDistDir: the distLib jars plus the jars
// built by this project.
task explodedDist(dependsOn: [jar], description: 'Builds the plugin zip file') << {
[explodedDistDir]*.mkdirs()
copy {
from configurations.distLib
into explodedDistDir
}
// remove elasticsearch files (compile above adds the elasticsearch one)
ant.delete { fileset(dir: explodedDistDir, includes: "elasticsearch-*.jar") }
// copy the jars built by this project (this runs after the delete above, so they are kept)
copy {
from libsDir
into explodedDistDir
}
// javadoc and sources jars are not part of the distribution
ant.delete { fileset(dir: explodedDistDir, includes: "elasticsearch-*-javadoc.jar") }
ant.delete { fileset(dir: explodedDistDir, includes: "elasticsearch-*-sources.jar") }
}
// Zips the content of the exploded distribution directory.
task zip(type: Zip, dependsOn: ['explodedDist']) {
from(explodedDistDir) {
}
}
// Removes the exploded directory and copies what is left in distsDir into the "plugins"
// directory under the root project's distsDir.
task release(dependsOn: [zip]) << {
ant.delete(dir: explodedDistDir)
copy {
from distsDir
into(new File(rootProject.distsDir, "plugins"))
}
}
// configuration holding the wagon jar handed to the maven deployer in uploadArchives
configurations {
deployerJars
}
dependencies {
deployerJars "org.apache.maven.wagon:wagon-http:1.0-beta-2"
}
// jar with all main sources, published with the 'sources' classifier
task sourcesJar(type: Jar, dependsOn: classes) {
classifier = 'sources'
from sourceSets.main.allSource
}
// jar with the generated javadoc, published with the 'javadoc' classifier
task javadocJar(type: Jar, dependsOn: javadoc) {
classifier = 'javadoc'
from javadoc.destinationDir
}
// publish the sources and javadoc jars together with the main archive
artifacts {
archives sourcesJar
archives javadocJar
}
// Publishes the archives to the maven repositories configured on the root project.
uploadArchives {
    repositories.mavenDeployer {
        configuration = configurations.deployerJars
        repository(url: rootProject.mavenRepoUrl) {
            authentication(userName: rootProject.mavenRepoUser, password: rootProject.mavenRepoPass)
        }
        snapshotRepository(url: rootProject.mavenSnapshotRepoUrl) {
            authentication(userName: rootProject.mavenRepoUser, password: rootProject.mavenRepoPass)
        }

        // metadata written into the generated POM
        pom.project {
            inceptionYear '2009'
            name 'elasticsearch-plugins-analysis-icu'
            // describes this plugin; the previous text was copied from the attachments plugin
            description 'Analysis ICU Plugin for ElasticSearch'
            licenses {
                license {
                    name 'The Apache Software License, Version 2.0'
                    url 'http://www.apache.org/licenses/LICENSE-2.0.txt'
                    distribution 'repo'
                }
            }
            scm {
                connection 'git://github.com/elasticsearch/elasticsearch.git'
                developerConnection 'git@github.com:elasticsearch/elasticsearch.git'
                url 'http://github.com/elasticsearch/elasticsearch'
            }
        }

        pom.whenConfigured {pom ->
            pom.dependencies = pom.dependencies.findAll {dep -> dep.scope != 'test' } // removes the test scoped ones
        }
    }
}

View File

@ -0,0 +1 @@
plugin=org.elasticsearch.plugin.analysis.icu.AnalysisICUPlugin

View File

@ -0,0 +1,74 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import com.ibm.icu.text.Normalizer2;
import org.apache.lucene.analysis.TokenStream;
/**
* A TokenFilter that applies search term folding to Unicode text,
* applying foldings from UTR#30 Character Foldings.
* <p>
* This filter applies the following foldings from the report to unicode text:
* <ul>
* <li>Accent removal
* <li>Case folding
* <li>Canonical duplicates folding
* <li>Dashes folding
* <li>Diacritic removal (including stroke, hook, descender)
* <li>Greek letterforms folding
* <li>Han Radical folding
* <li>Hebrew Alternates folding
* <li>Jamo folding
* <li>Letterforms folding
* <li>Math symbol folding
* <li>Multigraph Expansions: All
* <li>Native digit folding
* <li>No-break folding
* <li>Overline folding
* <li>Positional forms folding
* <li>Small forms folding
* <li>Space folding
* <li>Spacing Accents folding
* <li>Subscript folding
* <li>Superscript folding
* <li>Suzhou Numeral folding
* <li>Symbol folding
* <li>Underline folding
* <li>Vertical forms folding
* <li>Width folding
* </ul>
* <p>
* Additionally, Default Ignorables are removed, and text is normalized to NFKC.
* All foldings, case folding, and normalization mappings are applied recursively
* to ensure a fully folded and normalized result.
* </p>
*/
public final class ICUFoldingFilter extends ICUNormalizer2Filter {

    // Shared by all filter instances. Built when this class is initialized from the
    // "utr30.nrm" resource resolved relative to this class, using the name "utr30"
    // and COMPOSE mode.
    private static final Normalizer2 UTR30_NORMALIZER = Normalizer2.getInstance(
            ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"),
            "utr30",
            Normalizer2.Mode.COMPOSE);

    /**
     * Creates a folding filter that reads its tokens from the given stream.
     *
     * @param input the token stream to fold
     */
    public ICUFoldingFilter(TokenStream input) {
        super(input, UTR30_NORMALIZER);
    }
}

View File

@ -0,0 +1,101 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.Normalizer2;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.elasticsearch.util.lucene.analysis.CharSequenceTermAttribute;
import java.io.IOException;
/**
* Normalize token text with ICU's {@link com.ibm.icu.text.Normalizer2}
* <p>
* With this filter, you can normalize text in the following ways:
* <ul>
* <li> NFKC Normalization, Case Folding, and removing Ignorables (the default)
* <li> Using a standard Normalization mode (NFC, NFD, NFKC, NFKD)
* <li> Based on rules from a custom normalization mapping.
* </ul>
* <p>
* If you use the defaults, this filter is a simple way to standardize Unicode text
* in a language-independent way for search:
* <ul>
* <li> The case folding that it does can be seen as a replacement for
* LowerCaseFilter: For example, it handles cases such as the Greek sigma, so that
* "Μάϊος" and "ΜΆΪΟΣ" will match correctly.
* <li> The normalization standardizes different forms of the same
* character in Unicode. For example, CJK full-width numbers will be standardized
* to their ASCII forms.
* <li> Ignorables such as Zero-Width Joiner and Variation Selectors are removed.
* These are typically modifier characters that affect display.
* </ul>
*
* @see com.ibm.icu.text.Normalizer2
* @see com.ibm.icu.text.FilteredNormalizer2
*/
// TODO Lucene Monitor: Once 3.1 is released use it instead
public class ICUNormalizer2Filter extends TokenFilter {

    private final Normalizer2 normalizer;

    private final TermAttribute termAtt = addAttribute(TermAttribute.class);

    // CharSequence view over termAtt, so the normalizer can read the current term directly
    private final CharSequenceTermAttribute charSequenceTermAtt = new CharSequenceTermAttribute(termAtt);

    // scratch buffer reused for every token that needs to be rewritten
    private final StringBuilder buffer = new StringBuilder();

    /**
     * Creates a filter using the "nfkc_cf" normalizer in COMPOSE mode, which combines NFKC
     * normalization, Case Folding, and removal of Default Ignorables (NFKC_Casefold).
     *
     * @param input the token stream to normalize
     */
    public ICUNormalizer2Filter(TokenStream input) {
        this(input, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
    }

    /**
     * Creates a filter that normalizes token text with the given normalizer.
     *
     * @param input      the token stream to normalize
     * @param normalizer the normalizer applied to each token
     */
    public ICUNormalizer2Filter(TokenStream input, Normalizer2 normalizer) {
        super(input);
        this.normalizer = normalizer;
    }

    /**
     * Advances the wrapped stream and, unless the quick check reports the term as already
     * normalized, replaces the term text with its normalized form.
     *
     * @return {@code false} once the wrapped stream is exhausted, {@code true} otherwise
     */
    @Override
    public final boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
            return false;
        }
        if (normalizer.quickCheck(charSequenceTermAtt) == Normalizer.YES) {
            // term is already in normalized form, leave it as is
            return true;
        }
        buffer.setLength(0);
        normalizer.normalize(charSequenceTermAtt, buffer);
        termAtt.setTermBuffer(buffer.toString());
        return true;
    }
}

View File

@ -0,0 +1,62 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.elasticsearch.util.guice.inject.Scopes;
import org.elasticsearch.util.guice.inject.assistedinject.FactoryProvider;
import org.elasticsearch.util.guice.inject.multibindings.MapBinder;
import org.elasticsearch.util.settings.Settings;
import java.util.Map;
/**
 * Binds the ICU token filter factories (normalizer, folding and collation) under both their
 * camelCase and underscore names. A name is only bound when {@code groupSettings} has no entry
 * under that same name. No tokenizers or analyzers are contributed.
 *
 * @author kimchy (shay.banon)
 */
public class IcuAnalysisBinderProcessor implements AnalysisModule.AnalysisBinderProcessor {

    @Override public void processTokenFilters(MapBinder<String, TokenFilterFactoryFactory> binder, Map<String, Settings> groupSettings) {
        bindUnlessPresent(binder, groupSettings, "icuNormalizer", IcuNormalizerTokenFilterFactory.class);
        bindUnlessPresent(binder, groupSettings, "icu_normalizer", IcuNormalizerTokenFilterFactory.class);

        bindUnlessPresent(binder, groupSettings, "icuFolding", IcuFoldingTokenFilterFactory.class);
        bindUnlessPresent(binder, groupSettings, "icu_folding", IcuFoldingTokenFilterFactory.class);

        bindUnlessPresent(binder, groupSettings, "icuCollation", IcuCollationTokenFilterFactory.class);
        bindUnlessPresent(binder, groupSettings, "icu_collation", IcuCollationTokenFilterFactory.class);
    }

    @Override public void processTokenizers(MapBinder<String, TokenizerFactoryFactory> binder, Map<String, Settings> groupSettings) {
        // no ICU tokenizers to register
    }

    @Override public void processAnalyzers(MapBinder<String, AnalyzerProviderFactory> binder, Map<String, Settings> groupSettings) {
        // no ICU analyzers to register
    }

    /**
     * Adds a singleton scoped binding for the given filter name, backed by an assisted inject
     * factory for the given implementation, unless the name is already a key of the settings map.
     */
    private static void bindUnlessPresent(MapBinder<String, TokenFilterFactoryFactory> binder, Map<String, Settings> groupSettings,
                                          String filterName, Class<?> factoryImplementation) {
        if (groupSettings.containsKey(filterName)) {
            return;
        }
        binder.addBinding(filterName)
                .toProvider(FactoryProvider.newFactory(TokenFilterFactoryFactory.class, factoryImplementation))
                .in(Scopes.SINGLETON);
    }
}

View File

@ -0,0 +1,104 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.collation.ICUCollationKeyFilter;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.env.Environment;
import org.elasticsearch.env.FailedToResolveConfigException;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;
import org.elasticsearch.util.guice.inject.Inject;
import org.elasticsearch.util.guice.inject.assistedinject.Assisted;
import org.elasticsearch.util.settings.Settings;
import java.io.IOException;
import java.util.Locale;
/**
* An ICU based collation token filter. There are two ways to configure collation:
*
* <p>The first is simply specifying the locale (defaults to the default locale). The <tt>language</tt>
* parameter is the lowercase two-letter ISO-639 code. An additional <tt>country</tt> and <tt>variant</tt>
* can be provided.
*
* <p>The second option is to specify collation rules as defined in the <a href="http://www.icu-project.org/userguide/Collate_Customization.html">
* Collation customization</a> chapter in icu docs. The <tt>rules</tt> parameter can either embed the rules definition
* in the settings or refer to an external location (preferably located under the <tt>config</tt> location, relative to it).
*
* @author kimchy (shay.banon)
*/
public class IcuCollationTokenFilterFactory extends AbstractTokenFilterFactory {
private final Collator collator;
@Inject public IcuCollationTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment environment, @Assisted String name, @Assisted Settings settings) {
super(index, indexSettings, name);
Collator collator;
String rules = settings.get("rules");
if (rules != null) {
FailedToResolveConfigException failureToResolve = null;
try {
rules = environment.resolveConfigAndLoadToString(rules);
} catch (FailedToResolveConfigException e) {
failureToResolve = e;
} catch (IOException e) {
throw new ElasticSearchIllegalArgumentException("Failed to load collation rules", e);
}
try {
collator = new RuleBasedCollator(rules);
} catch (Exception e) {
if (failureToResolve != null) {
throw new ElasticSearchIllegalArgumentException("Failed to resolve collation rules location", failureToResolve);
} else {
throw new ElasticSearchIllegalArgumentException("Failed to parse collation rules", e);
}
}
} else {
String language = settings.get("language");
if (language != null) {
Locale locale;
String country = settings.get("country");
if (country != null) {
String variant = settings.get("variant");
if (variant != null) {
locale = new Locale(language, country, variant);
} else {
locale = new Locale(language, country);
}
} else {
locale = new Locale(language);
}
collator = Collator.getInstance(locale);
} else {
collator = Collator.getInstance();
}
}
this.collator = collator;
}
@Override public TokenStream create(TokenStream tokenStream) {
return new ICUCollationKeyFilter(tokenStream, collator);
}
}

View File

@ -0,0 +1,42 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;
import org.elasticsearch.util.guice.inject.Inject;
import org.elasticsearch.util.guice.inject.assistedinject.Assisted;
import org.elasticsearch.util.settings.Settings;
/**
 * Token filter factory that wraps token streams in an {@link ICUFoldingFilter}.
 * The filter specific settings passed to the constructor are not read.
 *
 * @author kimchy (shay.banon)
 */
public class IcuFoldingTokenFilterFactory extends AbstractTokenFilterFactory {

    @Inject
    public IcuFoldingTokenFilterFactory(Index index,
                                        @IndexSettings Settings indexSettings,
                                        @Assisted String name,
                                        @Assisted Settings settings) {
        super(index, indexSettings, name);
    }

    /**
     * Returns a new {@link ICUFoldingFilter} reading from the given stream.
     */
    @Override
    public TokenStream create(TokenStream tokenStream) {
        return new ICUFoldingFilter(tokenStream);
    }
}

View File

@ -0,0 +1,51 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import com.ibm.icu.text.Normalizer2;
import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;
import org.elasticsearch.util.guice.inject.Inject;
import org.elasticsearch.util.guice.inject.assistedinject.Assisted;
import org.elasticsearch.util.settings.Settings;
/**
* Uses the {@link org.elasticsearch.index.analysis.ICUNormalizer2Filter} to normalize tokens.
*
* <p>The <tt>name</tt> can be used to provide the type of normalization to perform.
*
* @author kimchy (shay.banon)
* @see org.elasticsearch.index.analysis.ICUNormalizer2Filter
*/
public class IcuNormalizerTokenFilterFactory extends AbstractTokenFilterFactory {

    // value of the "name" setting, "nfkc_cf" when the setting is absent
    private final String normalizerName;

    @Inject
    public IcuNormalizerTokenFilterFactory(Index index,
                                           @IndexSettings Settings indexSettings,
                                           @Assisted String name,
                                           @Assisted Settings settings) {
        super(index, indexSettings, name);
        this.normalizerName = settings.get("name", "nfkc_cf");
    }

    /**
     * Wraps the given stream in an {@link ICUNormalizer2Filter}. The normalizer is looked up
     * by the configured name, in COMPOSE mode, on every call.
     */
    @Override
    public TokenStream create(TokenStream tokenStream) {
        Normalizer2 normalizer = Normalizer2.getInstance(null, normalizerName, Normalizer2.Mode.COMPOSE);
        return new ICUNormalizer2Filter(tokenStream, normalizer);
    }
}

View File

@ -0,0 +1,46 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.plugin.analysis.icu;
import org.elasticsearch.index.analysis.AnalysisModule;
import org.elasticsearch.index.analysis.IcuAnalysisBinderProcessor;
import org.elasticsearch.plugins.AbstractPlugin;
import org.elasticsearch.util.guice.inject.Module;
/**
 * Plugin that adds the ICU analysis components by attaching an
 * {@link IcuAnalysisBinderProcessor} to every {@link AnalysisModule} it is handed.
 *
 * @author kimchy (shay.banon)
 */
public class AnalysisICUPlugin extends AbstractPlugin {

    @Override
    public String name() {
        return "analysis-icu";
    }

    @Override
    public String description() {
        return "UTF related ICU analysis support";
    }

    /**
     * Adds the ICU binder processor to analysis modules; all other modules are ignored.
     */
    @Override
    public void processModule(Module module) {
        if (!(module instanceof AnalysisModule)) {
            return;
        }
        ((AnalysisModule) module).addProcessor(new IcuAnalysisBinderProcessor());
    }
}

View File

@ -0,0 +1,42 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import com.ibm.icu.text.Normalizer2;
import org.hamcrest.MatcherAssert;
import org.hamcrest.Matchers;
import org.testng.annotations.Test;
import java.text.Normalizer;
/**
 * Compares ICU's "nfkc_cf" normalizer with the JDK's plain NFKC normalization on the same
 * inputs: the ICU results are expected in lower case, the JDK results unchanged.
 *
 * @author kimchy (shay.banon)
 */
public class Normalizer2Tests {

    @Test
    public void testNormalizer2() {
        Normalizer2 nfkcCaseFold = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
        String[] inputs = {"Jordania", "João"};
        String[] expectedFolded = {"jordania", "joão"};

        // ICU: NFKC plus case folding
        for (int i = 0; i < inputs.length; i++) {
            MatcherAssert.assertThat(nfkcCaseFold.normalize(inputs[i]), Matchers.equalTo(expectedFolded[i]));
        }

        // JDK: NFKC only, so the inputs are expected back unchanged
        for (String input : inputs) {
            MatcherAssert.assertThat(Normalizer.normalize(input, Normalizer.Form.NFKC), Matchers.equalTo(input));
        }
    }
}

View File

@ -0,0 +1,50 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexNameModule;
import org.elasticsearch.index.settings.IndexSettingsModule;
import org.elasticsearch.util.guice.inject.Guice;
import org.elasticsearch.util.guice.inject.Injector;
import org.hamcrest.MatcherAssert;
import org.testng.annotations.Test;
import static org.elasticsearch.util.settings.ImmutableSettings.Builder.*;
import static org.hamcrest.Matchers.*;
/**
 * Checks that an analysis module with the ICU binder processor and empty settings exposes
 * the "icu_normalizer" token filter.
 *
 * @author kimchy (shay.banon)
 */
public class SimpleIcuAnalysisTests {

    @Test
    public void testDefaultsIcuAnalysis() {
        Index index = new Index("test");
        IndexSettingsModule settingsModule = new IndexSettingsModule(EMPTY_SETTINGS);
        IndexNameModule nameModule = new IndexNameModule(index);

        Injector injector = Guice.createInjector(
                settingsModule,
                nameModule,
                new AnalysisModule(EMPTY_SETTINGS).addProcessor(new IcuAnalysisBinderProcessor()));

        AnalysisService analysisService = injector.getInstance(AnalysisService.class);
        TokenFilterFactory icuNormalizer = analysisService.tokenFilter("icu_normalizer");

        MatcherAssert.assertThat(icuNormalizer, instanceOf(IcuNormalizerTokenFilterFactory.class));
    }
}

View File

@ -117,7 +117,7 @@ uploadArchives {
pom.project {
inceptionYear '2009'
name 'elasticsearch-plugins-attachments'
name 'elasticsearch-plugins-mapper-attachments'
description 'Attachments Plugin for ElasticSearch'
licenses {
license {

View File

@ -1 +1 @@
plugin=org.elasticsearch.plugin.attachments.AttachmentsPlugin
plugin=org.elasticsearch.plugin.attachments.MapperAttachmentsPlugin

View File

@ -25,7 +25,7 @@ import org.elasticsearch.plugin.attachments.index.mapper.JsonAttachmentMapperSer
/**
* @author kimchy (shay.banon)
*/
public class AttachmentsIndexModule extends AbstractModule {
public class MapperAttachmentsIndexModule extends AbstractModule {
@Override protected void configure() {
bind(JsonAttachmentMapperService.class).asEagerSingleton();

View File

@ -29,7 +29,7 @@ import static org.elasticsearch.util.gcommon.collect.Lists.*;
/**
* @author kimchy (shay.banon)
*/
public class AttachmentsPlugin extends AbstractPlugin {
public class MapperAttachmentsPlugin extends AbstractPlugin {
@Override public String name() {
return "mapper-attachments";
@ -41,7 +41,7 @@ public class AttachmentsPlugin extends AbstractPlugin {
@Override public Collection<Class<? extends Module>> indexModules() {
Collection<Class<? extends Module>> modules = newArrayList();
modules.add(AttachmentsIndexModule.class);
modules.add(MapperAttachmentsIndexModule.class);
return modules;
}
}

View File

@ -6,6 +6,7 @@ include 'test-integration'
include 'benchmark-micro'
include 'plugins-analysis-icu'
include 'plugins-mapper-attachments'
include 'plugins-client-groovy'
include 'plugins-transport-memcached'