From 343c80b100c982ff9103ee9a6c192fa756dda6f2 Mon Sep 17 00:00:00 2001 From: kimchy Date: Sun, 17 Apr 2011 03:55:33 +0300 Subject: [PATCH] shard tokenizers, token filters, char filters across indices / shards --- .../common/logging/ESLoggerFactory.java | 2 +- .../index/analysis/AnalysisModule.java | 26 +- .../index/analysis/CharFilterFactory.java | 3 +- .../analysis/LengthTokenFilterFactory.java | 2 +- ... => PreBuiltCharFilterFactoryFactory.java} | 23 +- .../PreBuiltTokenFilterFactoryFactory.java | 35 ++ .../PreBuiltTokenizerFactoryFactory.java | 35 ++ .../analysis/StandardHtmlStripAnalyzer.java | 52 ++ .../StandardHtmlStripAnalyzerProvider.java | 14 +- .../index/analysis/TokenFilterFactory.java | 3 +- .../index/analysis/TokenizerFactory.java | 3 +- .../analysis/IndicesAnalysisService.java | 478 +++++++++++++++++- 12 files changed, 633 insertions(+), 43 deletions(-) rename modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/{RussianLetterTokenizerFactory.java => PreBuiltCharFilterFactoryFactory.java} (54%) create mode 100644 modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenFilterFactoryFactory.java create mode 100644 modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenizerFactoryFactory.java create mode 100644 modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/StandardHtmlStripAnalyzer.java diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/common/logging/ESLoggerFactory.java b/modules/elasticsearch/src/main/java/org/elasticsearch/common/logging/ESLoggerFactory.java index d3bd778b0e5..069269f5f61 100644 --- a/modules/elasticsearch/src/main/java/org/elasticsearch/common/logging/ESLoggerFactory.java +++ b/modules/elasticsearch/src/main/java/org/elasticsearch/common/logging/ESLoggerFactory.java @@ -57,7 +57,7 @@ public abstract class ESLoggerFactory { public static ESLogger getLogger(String prefix, String name) { - return 
defaultFactory.newInstance(prefix.intern(), name.intern()); + return defaultFactory.newInstance(prefix == null ? null : prefix.intern(), name.intern()); } public static ESLogger getLogger(String name) { diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java index b3d9cdf05b3..1c9c83fe619 100644 --- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java +++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java @@ -80,15 +80,21 @@ public class AnalysisModule extends AbstractModule { public static class TokenizersBindings { private final MapBinder binder; private final Map groupSettings; + private final IndicesAnalysisService indicesAnalysisService; - public TokenizersBindings(MapBinder binder, Map groupSettings) { + public TokenizersBindings(MapBinder binder, Map groupSettings, IndicesAnalysisService indicesAnalysisService) { this.binder = binder; this.groupSettings = groupSettings; + this.indicesAnalysisService = indicesAnalysisService; } public void processTokenizer(String name, Class tokenizerFactory) { if (!groupSettings.containsKey(name)) { - binder.addBinding(name).toProvider(FactoryProvider.newFactory(TokenizerFactoryFactory.class, tokenizerFactory)).in(Scopes.SINGLETON); + if (indicesAnalysisService != null && indicesAnalysisService.hasTokenizer(name)) { + binder.addBinding(name).toInstance(indicesAnalysisService.tokenizerFactoryFactory(name)); + } else { + binder.addBinding(name).toProvider(FactoryProvider.newFactory(TokenizerFactoryFactory.class, tokenizerFactory)).in(Scopes.SINGLETON); + } } } } @@ -190,7 +196,11 @@ public class AnalysisModule extends AbstractModule { continue; } // register it as default under the name - charFilterBinder.addBinding(charFilterName).toProvider(FactoryProvider.newFactory(CharFilterFactoryFactory.class, 
clazz)).in(Scopes.SINGLETON); + if (indicesAnalysisService != null && indicesAnalysisService.hasCharFilter(charFilterName)) { + charFilterBinder.addBinding(charFilterName).toInstance(indicesAnalysisService.charFilterFactoryFactory(charFilterName)); + } else { + charFilterBinder.addBinding(charFilterName).toProvider(FactoryProvider.newFactory(CharFilterFactoryFactory.class, clazz)).in(Scopes.SINGLETON); + } } @@ -237,7 +247,11 @@ public class AnalysisModule extends AbstractModule { continue; } // register it as default under the name - tokenFilterBinder.addBinding(tokenFilterName).toProvider(FactoryProvider.newFactory(TokenFilterFactoryFactory.class, clazz)).in(Scopes.SINGLETON); + if (indicesAnalysisService != null && indicesAnalysisService.hasTokenFilter(tokenFilterName)) { + tokenFilterBinder.addBinding(tokenFilterName).toInstance(indicesAnalysisService.tokenFilterFactoryFactory(tokenFilterName)); + } else { + tokenFilterBinder.addBinding(tokenFilterName).toProvider(FactoryProvider.newFactory(TokenFilterFactoryFactory.class, clazz)).in(Scopes.SINGLETON); + } } // TOKENIZER @@ -257,7 +271,7 @@ public class AnalysisModule extends AbstractModule { tokenizerBinder.addBinding(tokenizerName).toProvider(FactoryProvider.newFactory(TokenizerFactoryFactory.class, type)).in(Scopes.SINGLETON); } - AnalysisBinderProcessor.TokenizersBindings tokenizersBindings = new AnalysisBinderProcessor.TokenizersBindings(tokenizerBinder, tokenizersSettings); + AnalysisBinderProcessor.TokenizersBindings tokenizersBindings = new AnalysisBinderProcessor.TokenizersBindings(tokenizerBinder, tokenizersSettings, indicesAnalysisService); for (AnalysisBinderProcessor processor : processors) { processor.processTokenizers(tokenizersBindings); } @@ -331,8 +345,6 @@ public class AnalysisModule extends AbstractModule { tokenizersBindings.processTokenizer("letter", LetterTokenizerFactory.class); tokenizersBindings.processTokenizer("lowercase", LowerCaseTokenizerFactory.class); 
tokenizersBindings.processTokenizer("whitespace", WhitespaceTokenizerFactory.class); - tokenizersBindings.processTokenizer("russian_letter", RussianLetterTokenizerFactory.class); - tokenizersBindings.processTokenizer("russianLetter", RussianLetterTokenizerFactory.class); tokenizersBindings.processTokenizer("nGram", NGramTokenizerFactory.class); tokenizersBindings.processTokenizer("ngram", NGramTokenizerFactory.class); diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/CharFilterFactory.java b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/CharFilterFactory.java index 253caab6984..1189b0f72a3 100644 --- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/CharFilterFactory.java +++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/CharFilterFactory.java @@ -20,12 +20,11 @@ package org.elasticsearch.index.analysis; import org.apache.lucene.analysis.CharStream; -import org.elasticsearch.index.IndexComponent; /** * @author kimchy (shay.banon) */ -public interface CharFilterFactory extends IndexComponent { +public interface CharFilterFactory { String name(); diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/LengthTokenFilterFactory.java b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/LengthTokenFilterFactory.java index 0d411a90790..fe38eaf0b8a 100644 --- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/LengthTokenFilterFactory.java +++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/LengthTokenFilterFactory.java @@ -40,7 +40,7 @@ public class LengthTokenFilterFactory extends AbstractTokenFilterFactory { super(index, indexSettings, name, settings); min = settings.getAsInt("min", 0); max = settings.getAsInt("max", Integer.MAX_VALUE); - enablePositionIncrements = settings.getAsBoolean("enabled_position_increments", false); + enablePositionIncrements = 
settings.getAsBoolean("enabled_position_increments", true); } @Override public TokenStream create(TokenStream tokenStream) { diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/RussianLetterTokenizerFactory.java b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/PreBuiltCharFilterFactoryFactory.java similarity index 54% rename from modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/RussianLetterTokenizerFactory.java rename to modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/PreBuiltCharFilterFactoryFactory.java index 542fb0409b1..318f40a15ce 100644 --- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/RussianLetterTokenizerFactory.java +++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/PreBuiltCharFilterFactoryFactory.java @@ -19,26 +19,17 @@ package org.elasticsearch.index.analysis; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.ru.RussianLetterTokenizer; -import org.elasticsearch.common.inject.Inject; -import org.elasticsearch.common.inject.assistedinject.Assisted; import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.index.Index; -import org.elasticsearch.index.settings.IndexSettings; -import java.io.Reader; +public class PreBuiltCharFilterFactoryFactory implements CharFilterFactoryFactory { -/** - * @author kimchy (shay.banon) - */ -public class RussianLetterTokenizerFactory extends AbstractTokenizerFactory { + private final CharFilterFactory charFilterFactory; - @Inject public RussianLetterTokenizerFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { - super(index, indexSettings, name, settings); + public PreBuiltCharFilterFactoryFactory(CharFilterFactory charFilterFactory) { + this.charFilterFactory = charFilterFactory; } - @Override public Tokenizer create(Reader reader) { - return new 
RussianLetterTokenizer(reader); + @Override public CharFilterFactory create(String name, Settings settings) { + return charFilterFactory; } -} +} \ No newline at end of file diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenFilterFactoryFactory.java b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenFilterFactoryFactory.java new file mode 100644 index 00000000000..cd095f9cf50 --- /dev/null +++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenFilterFactoryFactory.java @@ -0,0 +1,35 @@ +/* + * Licensed to Elastic Search and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Elastic Search licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.analysis; + +import org.elasticsearch.common.settings.Settings; + +public class PreBuiltTokenFilterFactoryFactory implements TokenFilterFactoryFactory { + + private final TokenFilterFactory tokenFilterFactory; + + public PreBuiltTokenFilterFactoryFactory(TokenFilterFactory tokenFilterFactory) { + this.tokenFilterFactory = tokenFilterFactory; + } + + @Override public TokenFilterFactory create(String name, Settings settings) { + return tokenFilterFactory; + } +} \ No newline at end of file diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenizerFactoryFactory.java b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenizerFactoryFactory.java new file mode 100644 index 00000000000..c9581096d1d --- /dev/null +++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenizerFactoryFactory.java @@ -0,0 +1,35 @@ +/* + * Licensed to Elastic Search and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Elastic Search licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.analysis; + +import org.elasticsearch.common.settings.Settings; + +public class PreBuiltTokenizerFactoryFactory implements TokenizerFactoryFactory { + + private final TokenizerFactory tokenizerFactory; + + public PreBuiltTokenizerFactoryFactory(TokenizerFactory tokenizerFactory) { + this.tokenizerFactory = tokenizerFactory; + } + + @Override public TokenizerFactory create(String name, Settings settings) { + return tokenizerFactory; + } +} \ No newline at end of file diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/StandardHtmlStripAnalyzer.java b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/StandardHtmlStripAnalyzer.java new file mode 100644 index 00000000000..1e75c23e418 --- /dev/null +++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/StandardHtmlStripAnalyzer.java @@ -0,0 +1,52 @@ +/* + * Licensed to Elastic Search and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Elastic Search licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.*; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.standard.StandardFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.util.Version; + +import java.io.IOException; +import java.io.Reader; + +public class StandardHtmlStripAnalyzer extends StopwordAnalyzerBase { + + public StandardHtmlStripAnalyzer(Version version) { + super(version, StopAnalyzer.ENGLISH_STOP_WORDS_SET); + } + + @Override protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) { + final StandardTokenizer src = new StandardTokenizer(matchVersion, reader); + src.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH); + TokenStream tok = new StandardFilter(matchVersion, src); + tok = new LowerCaseFilter(matchVersion, tok); + tok = new StopFilter(matchVersion, tok, stopwords); + return new TokenStreamComponents(src, tok) { + @Override + protected boolean reset(final Reader reader) throws IOException { + src.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH); + return super.reset(reader); + } + }; + } + +} \ No newline at end of file diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/StandardHtmlStripAnalyzerProvider.java b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/StandardHtmlStripAnalyzerProvider.java index 264a9435292..69f15db5de4 100644 --- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/StandardHtmlStripAnalyzerProvider.java +++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/StandardHtmlStripAnalyzerProvider.java @@ -28,22 +28,16 @@ import org.elasticsearch.index.settings.IndexSettings; /** * @author kimchy (shay.banon) */ -public class StandardHtmlStripAnalyzerProvider extends AbstractIndexAnalyzerProvider { +public class StandardHtmlStripAnalyzerProvider extends 
AbstractIndexAnalyzerProvider { - private final CustomAnalyzer analyzer; + private final StandardHtmlStripAnalyzer analyzer; @Inject public StandardHtmlStripAnalyzerProvider(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { super(index, indexSettings, name, settings); - analyzer = new CustomAnalyzer(new StandardTokenizerFactory(index, indexSettings, name, settings), - new CharFilterFactory[]{new HtmlStripCharFilterFactory(index, indexSettings, name, settings)}, - new TokenFilterFactory[]{ - new StandardTokenFilterFactory(index, indexSettings, name, settings), - new LowerCaseTokenFilterFactory(index, indexSettings, name, settings), - new StopTokenFilterFactory(index, indexSettings, name, settings) - }); + analyzer = new StandardHtmlStripAnalyzer(version); } - @Override public CustomAnalyzer get() { + @Override public StandardHtmlStripAnalyzer get() { return this.analyzer; } } diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/TokenFilterFactory.java b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/TokenFilterFactory.java index 01c4bfa980c..0ec6a6ddbcd 100644 --- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/TokenFilterFactory.java +++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/TokenFilterFactory.java @@ -20,12 +20,11 @@ package org.elasticsearch.index.analysis; import org.apache.lucene.analysis.TokenStream; -import org.elasticsearch.index.IndexComponent; /** * @author kimchy (Shay Banon) */ -public interface TokenFilterFactory extends IndexComponent { +public interface TokenFilterFactory { String name(); diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/TokenizerFactory.java b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/TokenizerFactory.java index c5e01cf0ad7..f018d785b90 100644 --- 
a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/TokenizerFactory.java +++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/TokenizerFactory.java @@ -20,14 +20,13 @@ package org.elasticsearch.index.analysis; import org.apache.lucene.analysis.Tokenizer; -import org.elasticsearch.index.IndexComponent; import java.io.Reader; /** * @author kimchy (Shay Banon) */ -public interface TokenizerFactory extends IndexComponent { +public interface TokenizerFactory { String name(); diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/indices/analysis/IndicesAnalysisService.java b/modules/elasticsearch/src/main/java/org/elasticsearch/indices/analysis/IndicesAnalysisService.java index 14badf62227..9d25b4a3773 100644 --- a/modules/elasticsearch/src/main/java/org/elasticsearch/indices/analysis/IndicesAnalysisService.java +++ b/modules/elasticsearch/src/main/java/org/elasticsearch/indices/analysis/IndicesAnalysisService.java @@ -21,25 +21,47 @@ package org.elasticsearch.indices.analysis; import org.apache.lucene.analysis.*; import org.apache.lucene.analysis.ar.ArabicAnalyzer; +import org.apache.lucene.analysis.ar.ArabicStemFilter; import org.apache.lucene.analysis.br.BrazilianAnalyzer; +import org.apache.lucene.analysis.br.BrazilianStemFilter; import org.apache.lucene.analysis.cn.ChineseAnalyzer; import org.apache.lucene.analysis.cz.CzechAnalyzer; +import org.apache.lucene.analysis.cz.CzechStemFilter; import org.apache.lucene.analysis.de.GermanAnalyzer; +import org.apache.lucene.analysis.de.GermanStemFilter; import org.apache.lucene.analysis.el.GreekAnalyzer; import org.apache.lucene.analysis.fa.PersianAnalyzer; import org.apache.lucene.analysis.fr.FrenchAnalyzer; +import org.apache.lucene.analysis.fr.FrenchStemFilter; +import org.apache.lucene.analysis.miscellaneous.PatternAnalyzer; +import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; +import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer; +import 
org.apache.lucene.analysis.ngram.NGramTokenFilter; +import org.apache.lucene.analysis.ngram.NGramTokenizer; import org.apache.lucene.analysis.nl.DutchAnalyzer; +import org.apache.lucene.analysis.nl.DutchStemFilter; +import org.apache.lucene.analysis.path.PathHierarchyTokenizer; +import org.apache.lucene.analysis.reverse.ReverseStringFilter; import org.apache.lucene.analysis.ru.RussianAnalyzer; +import org.apache.lucene.analysis.ru.RussianStemFilter; +import org.apache.lucene.analysis.shingle.ShingleFilter; +import org.apache.lucene.analysis.snowball.SnowballAnalyzer; +import org.apache.lucene.analysis.snowball.SnowballFilter; import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.standard.StandardFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer; import org.apache.lucene.analysis.th.ThaiAnalyzer; import org.elasticsearch.common.component.AbstractComponent; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.lucene.Lucene; +import org.elasticsearch.common.lucene.analysis.HTMLStripCharFilter; +import org.elasticsearch.common.regex.Regex; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.util.concurrent.ConcurrentCollections; -import org.elasticsearch.index.analysis.AnalyzerScope; -import org.elasticsearch.index.analysis.PreBuiltAnalyzerProviderFactory; +import org.elasticsearch.index.analysis.*; +import java.io.Reader; import java.util.Map; import static org.elasticsearch.common.settings.ImmutableSettings.Builder.*; @@ -53,6 +75,10 @@ public class IndicesAnalysisService extends AbstractComponent { private final Map analyzerProviderFactories = ConcurrentCollections.newConcurrentMap(); + private final Map tokenizerFactories = ConcurrentCollections.newConcurrentMap(); + private final Map tokenFilterFactories = ConcurrentCollections.newConcurrentMap(); + private final Map 
charFilterFactories = ConcurrentCollections.newConcurrentMap(); + public IndicesAnalysisService() { super(EMPTY_SETTINGS); } @@ -69,6 +95,10 @@ public class IndicesAnalysisService extends AbstractComponent { analyzerProviderFactories.put("simple", new PreBuiltAnalyzerProviderFactory("simple", AnalyzerScope.INDICES, new SimpleAnalyzer(Lucene.ANALYZER_VERSION))); // extended ones + analyzerProviderFactories.put("pattern", new PreBuiltAnalyzerProviderFactory("pattern", AnalyzerScope.INDICES, new PatternAnalyzer(Lucene.ANALYZER_VERSION, Regex.compile("\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/, null), true, StopAnalyzer.ENGLISH_STOP_WORDS_SET))); + analyzerProviderFactories.put("snowball", new PreBuiltAnalyzerProviderFactory("snowball", AnalyzerScope.INDICES, new SnowballAnalyzer(Lucene.ANALYZER_VERSION, "English", StopAnalyzer.ENGLISH_STOP_WORDS_SET))); + analyzerProviderFactories.put("standard_html_strip", new PreBuiltAnalyzerProviderFactory("standard_html_strip", AnalyzerScope.INDICES, new StandardHtmlStripAnalyzer(Lucene.ANALYZER_VERSION))); + analyzerProviderFactories.put("standardHtmlStrip", new PreBuiltAnalyzerProviderFactory("standardHtmlStrip", AnalyzerScope.INDICES, new StandardHtmlStripAnalyzer(Lucene.ANALYZER_VERSION))); analyzerProviderFactories.put("arabic", new PreBuiltAnalyzerProviderFactory("arabic", AnalyzerScope.INDICES, new ArabicAnalyzer(Lucene.ANALYZER_VERSION))); analyzerProviderFactories.put("brazilian", new PreBuiltAnalyzerProviderFactory("brazilian", AnalyzerScope.INDICES, new BrazilianAnalyzer(Lucene.ANALYZER_VERSION))); analyzerProviderFactories.put("chinese", new PreBuiltAnalyzerProviderFactory("chinese", AnalyzerScope.INDICES, new ChineseAnalyzer())); @@ -81,6 +111,450 @@ public class IndicesAnalysisService extends AbstractComponent { analyzerProviderFactories.put("persian", new PreBuiltAnalyzerProviderFactory("persian", AnalyzerScope.INDICES, new PersianAnalyzer(Lucene.ANALYZER_VERSION))); analyzerProviderFactories.put("russian", new 
PreBuiltAnalyzerProviderFactory("russian", AnalyzerScope.INDICES, new RussianAnalyzer(Lucene.ANALYZER_VERSION))); analyzerProviderFactories.put("thai", new PreBuiltAnalyzerProviderFactory("thai", AnalyzerScope.INDICES, new ThaiAnalyzer(Lucene.ANALYZER_VERSION))); + + // Base Tokenizers + tokenizerFactories.put("standard", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { + @Override public String name() { + return "standard"; + } + + @Override public Tokenizer create(Reader reader) { + return new StandardTokenizer(Lucene.ANALYZER_VERSION, reader); + } + })); + + tokenizerFactories.put("uax_url_email", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { + @Override public String name() { + return "uax_url_email"; + } + + @Override public Tokenizer create(Reader reader) { + return new UAX29URLEmailTokenizer(reader); + } + })); + + tokenizerFactories.put("uaxUrlEmail", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { + @Override public String name() { + return "uaxUrlEmail"; + } + + @Override public Tokenizer create(Reader reader) { + return new UAX29URLEmailTokenizer(reader); + } + })); + + tokenizerFactories.put("path_hierarchy", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { + @Override public String name() { + return "path_hierarchy"; + } + + @Override public Tokenizer create(Reader reader) { + return new PathHierarchyTokenizer(reader); + } + })); + + tokenizerFactories.put("pathHierarchy", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { + @Override public String name() { + return "pathHierarchy"; + } + + @Override public Tokenizer create(Reader reader) { + return new PathHierarchyTokenizer(reader); + } + })); + + tokenizerFactories.put("keyword", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { + @Override public String name() { + return "keyword"; + } + + @Override public Tokenizer create(Reader reader) { + return new KeywordTokenizer(reader); + } + })); + + tokenizerFactories.put("letter", 
new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { + @Override public String name() { + return "letter"; + } + + @Override public Tokenizer create(Reader reader) { + return new LetterTokenizer(Lucene.ANALYZER_VERSION, reader); + } + })); + + tokenizerFactories.put("lowercase", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { + @Override public String name() { + return "lowercase"; + } + + @Override public Tokenizer create(Reader reader) { + return new LowerCaseTokenizer(Lucene.ANALYZER_VERSION, reader); + } + })); + + tokenizerFactories.put("whitespace", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { + @Override public String name() { + return "whitespace"; + } + + @Override public Tokenizer create(Reader reader) { + return new WhitespaceTokenizer(Lucene.ANALYZER_VERSION, reader); + } + })); + + tokenizerFactories.put("nGram", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { + @Override public String name() { + return "nGram"; + } + + @Override public Tokenizer create(Reader reader) { + return new NGramTokenizer(reader); + } + })); + + tokenizerFactories.put("ngram", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { + @Override public String name() { + return "ngram"; + } + + @Override public Tokenizer create(Reader reader) { + return new NGramTokenizer(reader); + } + })); + + tokenizerFactories.put("edgeNGram", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { + @Override public String name() { + return "edgeNGram"; + } + + @Override public Tokenizer create(Reader reader) { + return new EdgeNGramTokenizer(reader, EdgeNGramTokenizer.DEFAULT_SIDE, EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE); + } + })); + + tokenizerFactories.put("edge_ngram", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { + @Override public String name() { + return "edge_ngram"; + } + + @Override public Tokenizer create(Reader reader) { + return new EdgeNGramTokenizer(reader, 
EdgeNGramTokenizer.DEFAULT_SIDE, EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE); + } + })); + + // Token Filters + tokenFilterFactories.put("stop", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override public String name() { + return "stop"; + } + + @Override public TokenStream create(TokenStream tokenStream) { + return new StopFilter(Lucene.ANALYZER_VERSION, tokenStream, StopAnalyzer.ENGLISH_STOP_WORDS_SET); + } + })); + + tokenFilterFactories.put("reverse", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override public String name() { + return "reverse"; + } + + @Override public TokenStream create(TokenStream tokenStream) { + return new ReverseStringFilter(Lucene.ANALYZER_VERSION, tokenStream); + } + })); + + tokenFilterFactories.put("asciifolding", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override public String name() { + return "asciifolding"; + } + + @Override public TokenStream create(TokenStream tokenStream) { + return new ASCIIFoldingFilter(tokenStream); + } + })); + + tokenFilterFactories.put("length", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override public String name() { + return "length"; + } + + @Override public TokenStream create(TokenStream tokenStream) { + return new LengthFilter(true, tokenStream, 0, Integer.MAX_VALUE); + } + })); + + tokenFilterFactories.put("lowercase", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override public String name() { + return "lowercase"; + } + + @Override public TokenStream create(TokenStream tokenStream) { + return new LowerCaseFilter(Lucene.ANALYZER_VERSION, tokenStream); + } + })); + + tokenFilterFactories.put("porterStem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override public String name() { + return "porterStem"; + } + + @Override public TokenStream create(TokenStream tokenStream) { + return new PorterStemFilter(tokenStream); 
+ } + })); + + tokenFilterFactories.put("porter_stem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override public String name() { + return "porter_stem"; + } + + @Override public TokenStream create(TokenStream tokenStream) { + return new PorterStemFilter(tokenStream); + } + })); + + tokenFilterFactories.put("standard", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override public String name() { + return "standard"; + } + + @Override public TokenStream create(TokenStream tokenStream) { + return new StandardFilter(Lucene.ANALYZER_VERSION, tokenStream); + } + })); + + tokenFilterFactories.put("nGram", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override public String name() { + return "nGram"; + } + + @Override public TokenStream create(TokenStream tokenStream) { + return new NGramTokenFilter(tokenStream); + } + })); + + tokenFilterFactories.put("ngram", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override public String name() { + return "ngram"; + } + + @Override public TokenStream create(TokenStream tokenStream) { + return new NGramTokenFilter(tokenStream); + } + })); + + tokenFilterFactories.put("edgeNGram", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override public String name() { + return "edgeNGram"; + } + + @Override public TokenStream create(TokenStream tokenStream) { + return new EdgeNGramTokenFilter(tokenStream, EdgeNGramTokenFilter.DEFAULT_SIDE, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE); + } + })); + + tokenFilterFactories.put("edge_ngram", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override public String name() { + return "edge_ngram"; + } + + @Override public TokenStream create(TokenStream tokenStream) { + return new EdgeNGramTokenFilter(tokenStream, EdgeNGramTokenFilter.DEFAULT_SIDE, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, 
EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE); + } + })); + + tokenFilterFactories.put("shingle", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override public String name() { + return "shingle"; + } + + @Override public TokenStream create(TokenStream tokenStream) { + return new ShingleFilter(tokenStream, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE); + } + })); + + // Extended Token Filters + tokenFilterFactories.put("snowball", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override public String name() { + return "snowball"; + } + + @Override public TokenStream create(TokenStream tokenStream) { + return new SnowballFilter(tokenStream, "English"); + } + })); + tokenFilterFactories.put("arabicStem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override public String name() { + return "arabicStem"; + } + + @Override public TokenStream create(TokenStream tokenStream) { + return new ArabicStemFilter(tokenStream); + } + })); + tokenFilterFactories.put("arabic_stem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override public String name() { + return "arabic_stem"; + } + + @Override public TokenStream create(TokenStream tokenStream) { + return new ArabicStemFilter(tokenStream); + } + })); + tokenFilterFactories.put("brazilianStem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override public String name() { + return "brazilianStem"; + } + + @Override public TokenStream create(TokenStream tokenStream) { + return new BrazilianStemFilter(tokenStream); + } + })); + tokenFilterFactories.put("brazilian_stem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override public String name() { + return "brazilian_stem"; + } + + @Override public TokenStream create(TokenStream tokenStream) { + return new BrazilianStemFilter(tokenStream); + } + })); + tokenFilterFactories.put("czechStem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + 
@Override public String name() { + return "czechStem"; + } + + @Override public TokenStream create(TokenStream tokenStream) { + return new CzechStemFilter(tokenStream); + } + })); + tokenFilterFactories.put("czech_stem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override public String name() { + return "czech_stem"; + } + + @Override public TokenStream create(TokenStream tokenStream) { + return new CzechStemFilter(tokenStream); + } + })); + tokenFilterFactories.put("dutchStem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override public String name() { + return "dutchStem"; + } + + @Override public TokenStream create(TokenStream tokenStream) { + return new DutchStemFilter(tokenStream); + } + })); + tokenFilterFactories.put("dutch_stem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override public String name() { + return "dutch_stem"; + } + + @Override public TokenStream create(TokenStream tokenStream) { + return new DutchStemFilter(tokenStream); + } + })); + tokenFilterFactories.put("frenchStem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override public String name() { + return "frenchStem"; + } + + @Override public TokenStream create(TokenStream tokenStream) { + return new FrenchStemFilter(tokenStream); + } + })); + tokenFilterFactories.put("french_stem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override public String name() { + return "french_stem"; + } + + @Override public TokenStream create(TokenStream tokenStream) { + return new FrenchStemFilter(tokenStream); + } + })); + tokenFilterFactories.put("germanStem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override public String name() { + return "germanStem"; + } + + @Override public TokenStream create(TokenStream tokenStream) { + return new GermanStemFilter(tokenStream); + } + })); + tokenFilterFactories.put("german_stem", new 
PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override public String name() { + return "german_stem"; + } + + @Override public TokenStream create(TokenStream tokenStream) { + return new GermanStemFilter(tokenStream); + } + })); + tokenFilterFactories.put("russianStem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override public String name() { + return "russianStem"; + } + + @Override public TokenStream create(TokenStream tokenStream) { + return new RussianStemFilter(tokenStream); + } + })); + tokenFilterFactories.put("russian_stem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override public String name() { + return "russian_stem"; + } + + @Override public TokenStream create(TokenStream tokenStream) { + return new RussianStemFilter(tokenStream); + } + })); + + // Char Filter + charFilterFactories.put("html_strip", new PreBuiltCharFilterFactoryFactory(new CharFilterFactory() { + @Override public String name() { + return "html_strip"; + } + + @Override public CharStream create(CharStream tokenStream) { + return new HTMLStripCharFilter(tokenStream); + } + })); + + charFilterFactories.put("htmlStrip", new PreBuiltCharFilterFactoryFactory(new CharFilterFactory() { + @Override public String name() { + return "htmlStrip"; + } + + @Override public CharStream create(CharStream tokenStream) { + return new HTMLStripCharFilter(tokenStream); + } + })); + } + + /** Tells whether a pre-built char filter is available under the given name. @param name the registration key to check, e.g. "html_strip" or "htmlStrip" @return {@code true} when {@link #charFilterFactoryFactory(String)} returns a non-null value for {@code name} */ public boolean hasCharFilter(String name) { + return charFilterFactoryFactory(name) != null; + } + + /** Looks up the char filter factory factory stored in {@code charFilterFactories} under the given name. @param name the registration key @return whatever {@code charFilterFactories.get(name)} yields; presumably {@code null} for an unregistered name, which is what {@link #hasCharFilter(String)} relies on (confirm against the map type) */ public CharFilterFactoryFactory charFilterFactoryFactory(String name) { + return charFilterFactories.get(name); + } + + /** Tells whether a pre-built token filter is available under the given name. @param name the registration key to check, e.g. "russian_stem" or "russianStem" @return {@code true} when {@link #tokenFilterFactoryFactory(String)} returns a non-null value for {@code name} */ public boolean hasTokenFilter(String name) { + return tokenFilterFactoryFactory(name) != null; + } + + /** Looks up the token filter factory factory stored in {@code tokenFilterFactories} under the given name. @param name the registration key @return whatever {@code tokenFilterFactories.get(name)} yields; presumably {@code null} for an unregistered name, which is what {@link #hasTokenFilter(String)} relies on (confirm against the map type) */ public TokenFilterFactoryFactory tokenFilterFactoryFactory(String name) { + return tokenFilterFactories.get(name); + } + + /** Tells whether a pre-built tokenizer is available under the given name. @param name the registration key to check @return {@code true} when {@link #tokenizerFactoryFactory(String)} returns a non-null value for {@code name} */ public boolean hasTokenizer(String name) { + return tokenizerFactoryFactory(name) != null; + } + + /** Looks up the tokenizer factory factory stored in {@code tokenizerFactories} under the given name. @param name the registration key @return whatever {@code tokenizerFactories.get(name)} yields; presumably {@code null} for an unregistered name, which is what {@link #hasTokenizer(String)} relies on (confirm against the map type) */ public
TokenizerFactoryFactory tokenizerFactoryFactory(String name) { + return tokenizerFactories.get(name); } public PreBuiltAnalyzerProviderFactory analyzerProviderFactory(String name) {