shard tokenizers, token filters, char filters across indices / shards

parent 067b8379d5
commit 343c80b100
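In brief: pre-built tokenizer, token filter, and char filter factories now live in the node-level IndicesAnalysisService, and AnalysisModule binds those shared instances instead of constructing a factory per index. A minimal sketch of the resulting sharing, using only types from this commit (the helper method itself is illustrative and not part of the commit; EMPTY_SETTINGS is assumed statically imported from ImmutableSettings.Builder, as done elsewhere in the diff):

    // Both calls return the same instance: the PreBuilt*FactoryFactory
    // wrappers ignore the per-index name and settings passed to create().
    void demonstrateSharing(IndicesAnalysisService indicesAnalysisService) {
        TokenizerFactoryFactory whitespace = indicesAnalysisService.tokenizerFactoryFactory("whitespace");
        TokenizerFactory forIndexA = whitespace.create("whitespace", EMPTY_SETTINGS);
        TokenizerFactory forIndexB = whitespace.create("whitespace", EMPTY_SETTINGS);
        assert forIndexA == forIndexB; // one tokenizer factory, shared across indices
    }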
ESLoggerFactory.java

@@ -57,7 +57,7 @@ public abstract class ESLoggerFactory {
 
 
     public static ESLogger getLogger(String prefix, String name) {
-        return defaultFactory.newInstance(prefix.intern(), name.intern());
+        return defaultFactory.newInstance(prefix == null ? null : prefix.intern(), name.intern());
     }
 
     public static ESLogger getLogger(String name) {
AnalysisModule.java

@@ -80,15 +80,21 @@ public class AnalysisModule extends AbstractModule {
     public static class TokenizersBindings {
         private final MapBinder<String, TokenizerFactoryFactory> binder;
         private final Map<String, Settings> groupSettings;
+        private final IndicesAnalysisService indicesAnalysisService;
 
-        public TokenizersBindings(MapBinder<String, TokenizerFactoryFactory> binder, Map<String, Settings> groupSettings) {
+        public TokenizersBindings(MapBinder<String, TokenizerFactoryFactory> binder, Map<String, Settings> groupSettings, IndicesAnalysisService indicesAnalysisService) {
             this.binder = binder;
             this.groupSettings = groupSettings;
+            this.indicesAnalysisService = indicesAnalysisService;
         }
 
         public void processTokenizer(String name, Class<? extends TokenizerFactory> tokenizerFactory) {
             if (!groupSettings.containsKey(name)) {
-                binder.addBinding(name).toProvider(FactoryProvider.newFactory(TokenizerFactoryFactory.class, tokenizerFactory)).in(Scopes.SINGLETON);
+                if (indicesAnalysisService != null && indicesAnalysisService.hasTokenizer(name)) {
+                    binder.addBinding(name).toInstance(indicesAnalysisService.tokenizerFactoryFactory(name));
+                } else {
+                    binder.addBinding(name).toProvider(FactoryProvider.newFactory(TokenizerFactoryFactory.class, tokenizerFactory)).in(Scopes.SINGLETON);
+                }
             }
         }
     }

@@ -190,7 +196,11 @@ public class AnalysisModule extends AbstractModule {
                 continue;
             }
             // register it as default under the name
-            charFilterBinder.addBinding(charFilterName).toProvider(FactoryProvider.newFactory(CharFilterFactoryFactory.class, clazz)).in(Scopes.SINGLETON);
+            if (indicesAnalysisService != null && indicesAnalysisService.hasCharFilter(charFilterName)) {
+                charFilterBinder.addBinding(charFilterName).toInstance(indicesAnalysisService.charFilterFactoryFactory(charFilterName));
+            } else {
+                charFilterBinder.addBinding(charFilterName).toProvider(FactoryProvider.newFactory(CharFilterFactoryFactory.class, clazz)).in(Scopes.SINGLETON);
+            }
         }
 
 

@@ -237,7 +247,11 @@ public class AnalysisModule extends AbstractModule {
                 continue;
             }
             // register it as default under the name
-            tokenFilterBinder.addBinding(tokenFilterName).toProvider(FactoryProvider.newFactory(TokenFilterFactoryFactory.class, clazz)).in(Scopes.SINGLETON);
+            if (indicesAnalysisService != null && indicesAnalysisService.hasTokenFilter(tokenFilterName)) {
+                tokenFilterBinder.addBinding(tokenFilterName).toInstance(indicesAnalysisService.tokenFilterFactoryFactory(tokenFilterName));
+            } else {
+                tokenFilterBinder.addBinding(tokenFilterName).toProvider(FactoryProvider.newFactory(TokenFilterFactoryFactory.class, clazz)).in(Scopes.SINGLETON);
+            }
         }
 
         // TOKENIZER

@@ -257,7 +271,7 @@ public class AnalysisModule extends AbstractModule {
             tokenizerBinder.addBinding(tokenizerName).toProvider(FactoryProvider.newFactory(TokenizerFactoryFactory.class, type)).in(Scopes.SINGLETON);
         }
 
-        AnalysisBinderProcessor.TokenizersBindings tokenizersBindings = new AnalysisBinderProcessor.TokenizersBindings(tokenizerBinder, tokenizersSettings);
+        AnalysisBinderProcessor.TokenizersBindings tokenizersBindings = new AnalysisBinderProcessor.TokenizersBindings(tokenizerBinder, tokenizersSettings, indicesAnalysisService);
         for (AnalysisBinderProcessor processor : processors) {
             processor.processTokenizers(tokenizersBindings);
         }

@@ -331,8 +345,6 @@ public class AnalysisModule extends AbstractModule {
         tokenizersBindings.processTokenizer("letter", LetterTokenizerFactory.class);
         tokenizersBindings.processTokenizer("lowercase", LowerCaseTokenizerFactory.class);
         tokenizersBindings.processTokenizer("whitespace", WhitespaceTokenizerFactory.class);
-        tokenizersBindings.processTokenizer("russian_letter", RussianLetterTokenizerFactory.class);
-        tokenizersBindings.processTokenizer("russianLetter", RussianLetterTokenizerFactory.class);
 
         tokenizersBindings.processTokenizer("nGram", NGramTokenizerFactory.class);
         tokenizersBindings.processTokenizer("ngram", NGramTokenizerFactory.class);
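The pre-built bindings above only apply to names an index leaves unconfigured: processTokenizer() returns early when groupSettings contains the name, so a tokenizer customized in the index settings keeps its own per-index factory. An illustrative settings snippet (the tokenizer name is hypothetical):

    // "my_ngram" appears under index.analysis.tokenizer, so it is bound
    // through the regular per-index provider path, never through the
    // shared IndicesAnalysisService factories.
    Settings indexSettings = ImmutableSettings.settingsBuilder()
            .put("index.analysis.tokenizer.my_ngram.type", "nGram")
            .put("index.analysis.tokenizer.my_ngram.min_gram", 2)
            .put("index.analysis.tokenizer.my_ngram.max_gram", 3)
            .build();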
CharFilterFactory.java

@@ -20,12 +20,11 @@
 package org.elasticsearch.index.analysis;
 
 import org.apache.lucene.analysis.CharStream;
-import org.elasticsearch.index.IndexComponent;
 
 /**
  * @author kimchy (shay.banon)
 */
-public interface CharFilterFactory extends IndexComponent {
+public interface CharFilterFactory {
 
     String name();
 
LengthTokenFilterFactory.java

@@ -40,7 +40,7 @@ public class LengthTokenFilterFactory extends AbstractTokenFilterFactory {
         super(index, indexSettings, name, settings);
         min = settings.getAsInt("min", 0);
         max = settings.getAsInt("max", Integer.MAX_VALUE);
-        enablePositionIncrements = settings.getAsBoolean("enabled_position_increments", false);
+        enablePositionIncrements = settings.getAsBoolean("enabled_position_increments", true);
     }
 
     @Override public TokenStream create(TokenStream tokenStream) {
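The default for enabled_position_increments flips from false to true here, in line with the pre-built "length" filter registered in IndicesAnalysisService below (new LengthFilter(true, ...)). A hedged sketch of the effect, reusing the Lucene 3.x-era constructors this commit uses elsewhere:

    // With position increments enabled, a token removed by the filter leaves
    // a positional gap rather than letting later tokens close ranks, so
    // phrase queries keep their original distances. Here "a" is dropped.
    TokenStream stream = new WhitespaceTokenizer(Lucene.ANALYZER_VERSION, new StringReader("a quick brown fox"));
    stream = new LengthFilter(true, stream, 2, Integer.MAX_VALUE);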
RussianLetterTokenizerFactory.java → PreBuiltCharFilterFactoryFactory.java

@@ -19,26 +19,17 @@
 
 package org.elasticsearch.index.analysis;
 
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.ru.RussianLetterTokenizer;
-import org.elasticsearch.common.inject.Inject;
-import org.elasticsearch.common.inject.assistedinject.Assisted;
 import org.elasticsearch.common.settings.Settings;
-import org.elasticsearch.index.Index;
-import org.elasticsearch.index.settings.IndexSettings;
 
-import java.io.Reader;
+public class PreBuiltCharFilterFactoryFactory implements CharFilterFactoryFactory {
 
-/**
- * @author kimchy (shay.banon)
- */
-public class RussianLetterTokenizerFactory extends AbstractTokenizerFactory {
+    private final CharFilterFactory charFilterFactory;
 
-    @Inject public RussianLetterTokenizerFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
-        super(index, indexSettings, name, settings);
+    public PreBuiltCharFilterFactoryFactory(CharFilterFactory charFilterFactory) {
+        this.charFilterFactory = charFilterFactory;
     }
 
-    @Override public Tokenizer create(Reader reader) {
-        return new RussianLetterTokenizer(reader);
+    @Override public CharFilterFactory create(String name, Settings settings) {
+        return charFilterFactory;
     }
 }
PreBuiltTokenFilterFactoryFactory.java

@@ -0,0 +1,35 @@
+/*
+ * Licensed to Elastic Search and Shay Banon under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. Elastic Search licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.elasticsearch.common.settings.Settings;
+
+public class PreBuiltTokenFilterFactoryFactory implements TokenFilterFactoryFactory {
+
+    private final TokenFilterFactory tokenFilterFactory;
+
+    public PreBuiltTokenFilterFactoryFactory(TokenFilterFactory tokenFilterFactory) {
+        this.tokenFilterFactory = tokenFilterFactory;
+    }
+
+    @Override public TokenFilterFactory create(String name, Settings settings) {
+        return tokenFilterFactory;
+    }
+}
PreBuiltTokenizerFactoryFactory.java

@@ -0,0 +1,35 @@
+/*
+ * Licensed to Elastic Search and Shay Banon under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. Elastic Search licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.elasticsearch.common.settings.Settings;
+
+public class PreBuiltTokenizerFactoryFactory implements TokenizerFactoryFactory {
+
+    private final TokenizerFactory tokenizerFactory;
+
+    public PreBuiltTokenizerFactoryFactory(TokenizerFactory tokenizerFactory) {
+        this.tokenizerFactory = tokenizerFactory;
+    }
+
+    @Override public TokenizerFactory create(String name, Settings settings) {
+        return tokenizerFactory;
+    }
+}
StandardHtmlStripAnalyzer.java

@@ -0,0 +1,52 @@
+/*
+ * Licensed to Elastic Search and Shay Banon under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. Elastic Search licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+
+import java.io.IOException;
+import java.io.Reader;
+
+public class StandardHtmlStripAnalyzer extends StopwordAnalyzerBase {
+
+    public StandardHtmlStripAnalyzer(Version version) {
+        super(version, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+    }
+
+    @Override protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
+        final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
+        src.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
+        TokenStream tok = new StandardFilter(matchVersion, src);
+        tok = new LowerCaseFilter(matchVersion, tok);
+        tok = new StopFilter(matchVersion, tok, stopwords);
+        return new TokenStreamComponents(src, tok) {
+            @Override
+            protected boolean reset(final Reader reader) throws IOException {
+                src.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
+                return super.reset(reader);
+            }
+        };
+    }
+
+}
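A short usage sketch for the new analyzer (driver code assumed, Lucene 3.x-era TokenStream API): the pipeline is a standard tokenizer followed by the standard filter, lowercasing, and English stopword removal.

    StandardHtmlStripAnalyzer analyzer = new StandardHtmlStripAnalyzer(Lucene.ANALYZER_VERSION);
    TokenStream stream = analyzer.tokenStream("body", new StringReader("The Quick Brown Fox"));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
        System.out.println(term.toString()); // quick, brown, fox ("The" is a stopword)
    }
    stream.close();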
StandardHtmlStripAnalyzerProvider.java

@@ -28,22 +28,16 @@ import org.elasticsearch.index.settings.IndexSettings;
 /**
  * @author kimchy (shay.banon)
 */
-public class StandardHtmlStripAnalyzerProvider extends AbstractIndexAnalyzerProvider<CustomAnalyzer> {
+public class StandardHtmlStripAnalyzerProvider extends AbstractIndexAnalyzerProvider<StandardHtmlStripAnalyzer> {
 
-    private final CustomAnalyzer analyzer;
+    private final StandardHtmlStripAnalyzer analyzer;
 
     @Inject public StandardHtmlStripAnalyzerProvider(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
         super(index, indexSettings, name, settings);
-        analyzer = new CustomAnalyzer(new StandardTokenizerFactory(index, indexSettings, name, settings),
-                new CharFilterFactory[]{new HtmlStripCharFilterFactory(index, indexSettings, name, settings)},
-                new TokenFilterFactory[]{
-                        new StandardTokenFilterFactory(index, indexSettings, name, settings),
-                        new LowerCaseTokenFilterFactory(index, indexSettings, name, settings),
-                        new StopTokenFilterFactory(index, indexSettings, name, settings)
-                });
+        analyzer = new StandardHtmlStripAnalyzer(version);
     }
 
-    @Override public CustomAnalyzer get() {
+    @Override public StandardHtmlStripAnalyzer get() {
         return this.analyzer;
     }
 }
TokenFilterFactory.java

@@ -20,12 +20,11 @@
 package org.elasticsearch.index.analysis;
 
 import org.apache.lucene.analysis.TokenStream;
-import org.elasticsearch.index.IndexComponent;
 
 /**
  * @author kimchy (Shay Banon)
 */
-public interface TokenFilterFactory extends IndexComponent {
+public interface TokenFilterFactory {
 
     String name();
 
TokenizerFactory.java

@@ -20,14 +20,13 @@
 package org.elasticsearch.index.analysis;
 
 import org.apache.lucene.analysis.Tokenizer;
-import org.elasticsearch.index.IndexComponent;
 
 import java.io.Reader;
 
 /**
  * @author kimchy (Shay Banon)
 */
-public interface TokenizerFactory extends IndexComponent {
+public interface TokenizerFactory {
 
     String name();
 
IndicesAnalysisService.java

@@ -21,25 +21,47 @@ package org.elasticsearch.indices.analysis;
 
 import org.apache.lucene.analysis.*;
 import org.apache.lucene.analysis.ar.ArabicAnalyzer;
+import org.apache.lucene.analysis.ar.ArabicStemFilter;
 import org.apache.lucene.analysis.br.BrazilianAnalyzer;
+import org.apache.lucene.analysis.br.BrazilianStemFilter;
 import org.apache.lucene.analysis.cn.ChineseAnalyzer;
 import org.apache.lucene.analysis.cz.CzechAnalyzer;
+import org.apache.lucene.analysis.cz.CzechStemFilter;
 import org.apache.lucene.analysis.de.GermanAnalyzer;
+import org.apache.lucene.analysis.de.GermanStemFilter;
 import org.apache.lucene.analysis.el.GreekAnalyzer;
 import org.apache.lucene.analysis.fa.PersianAnalyzer;
 import org.apache.lucene.analysis.fr.FrenchAnalyzer;
+import org.apache.lucene.analysis.fr.FrenchStemFilter;
+import org.apache.lucene.analysis.miscellaneous.PatternAnalyzer;
+import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
+import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
+import org.apache.lucene.analysis.ngram.NGramTokenFilter;
+import org.apache.lucene.analysis.ngram.NGramTokenizer;
 import org.apache.lucene.analysis.nl.DutchAnalyzer;
+import org.apache.lucene.analysis.nl.DutchStemFilter;
+import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
+import org.apache.lucene.analysis.reverse.ReverseStringFilter;
 import org.apache.lucene.analysis.ru.RussianAnalyzer;
+import org.apache.lucene.analysis.ru.RussianStemFilter;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
+import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
+import org.apache.lucene.analysis.snowball.SnowballFilter;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
 import org.apache.lucene.analysis.th.ThaiAnalyzer;
 import org.elasticsearch.common.component.AbstractComponent;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.lucene.Lucene;
+import org.elasticsearch.common.lucene.analysis.HTMLStripCharFilter;
+import org.elasticsearch.common.regex.Regex;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
-import org.elasticsearch.index.analysis.AnalyzerScope;
-import org.elasticsearch.index.analysis.PreBuiltAnalyzerProviderFactory;
+import org.elasticsearch.index.analysis.*;
 
+import java.io.Reader;
 import java.util.Map;
 
 import static org.elasticsearch.common.settings.ImmutableSettings.Builder.*;
@@ -53,6 +75,10 @@ public class IndicesAnalysisService extends AbstractComponent {
 
     private final Map<String, PreBuiltAnalyzerProviderFactory> analyzerProviderFactories = ConcurrentCollections.newConcurrentMap();
 
+    private final Map<String, PreBuiltTokenizerFactoryFactory> tokenizerFactories = ConcurrentCollections.newConcurrentMap();
+    private final Map<String, PreBuiltTokenFilterFactoryFactory> tokenFilterFactories = ConcurrentCollections.newConcurrentMap();
+    private final Map<String, PreBuiltCharFilterFactoryFactory> charFilterFactories = ConcurrentCollections.newConcurrentMap();
+
     public IndicesAnalysisService() {
         super(EMPTY_SETTINGS);
     }
@@ -69,6 +95,10 @@ public class IndicesAnalysisService extends AbstractComponent {
         analyzerProviderFactories.put("simple", new PreBuiltAnalyzerProviderFactory("simple", AnalyzerScope.INDICES, new SimpleAnalyzer(Lucene.ANALYZER_VERSION)));
 
         // extended ones
+        analyzerProviderFactories.put("pattern", new PreBuiltAnalyzerProviderFactory("pattern", AnalyzerScope.INDICES, new PatternAnalyzer(Lucene.ANALYZER_VERSION, Regex.compile("\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/, null), true, StopAnalyzer.ENGLISH_STOP_WORDS_SET)));
+        analyzerProviderFactories.put("snowball", new PreBuiltAnalyzerProviderFactory("snowball", AnalyzerScope.INDICES, new SnowballAnalyzer(Lucene.ANALYZER_VERSION, "English", StopAnalyzer.ENGLISH_STOP_WORDS_SET)));
+        analyzerProviderFactories.put("standard_html_strip", new PreBuiltAnalyzerProviderFactory("standard_html_strip", AnalyzerScope.INDICES, new StandardHtmlStripAnalyzer(Lucene.ANALYZER_VERSION)));
+        analyzerProviderFactories.put("standardHtmlStrip", new PreBuiltAnalyzerProviderFactory("standardHtmlStrip", AnalyzerScope.INDICES, new StandardHtmlStripAnalyzer(Lucene.ANALYZER_VERSION)));
         analyzerProviderFactories.put("arabic", new PreBuiltAnalyzerProviderFactory("arabic", AnalyzerScope.INDICES, new ArabicAnalyzer(Lucene.ANALYZER_VERSION)));
         analyzerProviderFactories.put("brazilian", new PreBuiltAnalyzerProviderFactory("brazilian", AnalyzerScope.INDICES, new BrazilianAnalyzer(Lucene.ANALYZER_VERSION)));
         analyzerProviderFactories.put("chinese", new PreBuiltAnalyzerProviderFactory("chinese", AnalyzerScope.INDICES, new ChineseAnalyzer()));
@@ -81,6 +111,450 @@ public class IndicesAnalysisService extends AbstractComponent {
         analyzerProviderFactories.put("persian", new PreBuiltAnalyzerProviderFactory("persian", AnalyzerScope.INDICES, new PersianAnalyzer(Lucene.ANALYZER_VERSION)));
         analyzerProviderFactories.put("russian", new PreBuiltAnalyzerProviderFactory("russian", AnalyzerScope.INDICES, new RussianAnalyzer(Lucene.ANALYZER_VERSION)));
         analyzerProviderFactories.put("thai", new PreBuiltAnalyzerProviderFactory("thai", AnalyzerScope.INDICES, new ThaiAnalyzer(Lucene.ANALYZER_VERSION)));
+
+        // Base Tokenizers
+        tokenizerFactories.put("standard", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
+            @Override public String name() {
+                return "standard";
+            }
+
+            @Override public Tokenizer create(Reader reader) {
+                return new StandardTokenizer(Lucene.ANALYZER_VERSION, reader);
+            }
+        }));
+
+        tokenizerFactories.put("uax_url_email", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
+            @Override public String name() {
+                return "uax_url_email";
+            }
+
+            @Override public Tokenizer create(Reader reader) {
+                return new UAX29URLEmailTokenizer(reader);
+            }
+        }));
+
+        tokenizerFactories.put("uaxUrlEmail", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
+            @Override public String name() {
+                return "uaxUrlEmail";
+            }
+
+            @Override public Tokenizer create(Reader reader) {
+                return new UAX29URLEmailTokenizer(reader);
+            }
+        }));
+
+        tokenizerFactories.put("path_hierarchy", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
+            @Override public String name() {
+                return "path_hierarchy";
+            }
+
+            @Override public Tokenizer create(Reader reader) {
+                return new PathHierarchyTokenizer(reader);
+            }
+        }));
+
+        tokenizerFactories.put("pathHierarchy", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
+            @Override public String name() {
+                return "pathHierarchy";
+            }
+
+            @Override public Tokenizer create(Reader reader) {
+                return new PathHierarchyTokenizer(reader);
+            }
+        }));
+
+        tokenizerFactories.put("keyword", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
+            @Override public String name() {
+                return "keyword";
+            }
+
+            @Override public Tokenizer create(Reader reader) {
+                return new KeywordTokenizer(reader);
+            }
+        }));
+
+        tokenizerFactories.put("letter", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
+            @Override public String name() {
+                return "letter";
+            }
+
+            @Override public Tokenizer create(Reader reader) {
+                return new LetterTokenizer(Lucene.ANALYZER_VERSION, reader);
+            }
+        }));
+
+        tokenizerFactories.put("lowercase", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
+            @Override public String name() {
+                return "lowercase";
+            }
+
+            @Override public Tokenizer create(Reader reader) {
+                return new LowerCaseTokenizer(Lucene.ANALYZER_VERSION, reader);
+            }
+        }));
+
+        tokenizerFactories.put("whitespace", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
+            @Override public String name() {
+                return "whitespace";
+            }
+
+            @Override public Tokenizer create(Reader reader) {
+                return new WhitespaceTokenizer(Lucene.ANALYZER_VERSION, reader);
+            }
+        }));
+
+        tokenizerFactories.put("nGram", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
+            @Override public String name() {
+                return "nGram";
+            }
+
+            @Override public Tokenizer create(Reader reader) {
+                return new NGramTokenizer(reader);
+            }
+        }));
+
+        tokenizerFactories.put("ngram", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
+            @Override public String name() {
+                return "ngram";
+            }
+
+            @Override public Tokenizer create(Reader reader) {
+                return new NGramTokenizer(reader);
+            }
+        }));
+
+        tokenizerFactories.put("edgeNGram", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
+            @Override public String name() {
+                return "edgeNGram";
+            }
+
+            @Override public Tokenizer create(Reader reader) {
+                return new EdgeNGramTokenizer(reader, EdgeNGramTokenizer.DEFAULT_SIDE, EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
+            }
+        }));
+
+        tokenizerFactories.put("edge_ngram", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
+            @Override public String name() {
+                return "edge_ngram";
+            }
+
+            @Override public Tokenizer create(Reader reader) {
+                return new EdgeNGramTokenizer(reader, EdgeNGramTokenizer.DEFAULT_SIDE, EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
+            }
+        }));
+
+        // Token Filters
+        tokenFilterFactories.put("stop", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+            @Override public String name() {
+                return "stop";
+            }
+
+            @Override public TokenStream create(TokenStream tokenStream) {
+                return new StopFilter(Lucene.ANALYZER_VERSION, tokenStream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+            }
+        }));
+
+        tokenFilterFactories.put("reverse", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+            @Override public String name() {
+                return "reverse";
+            }
+
+            @Override public TokenStream create(TokenStream tokenStream) {
+                return new ReverseStringFilter(Lucene.ANALYZER_VERSION, tokenStream);
+            }
+        }));
+
+        tokenFilterFactories.put("asciifolding", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+            @Override public String name() {
+                return "asciifolding";
+            }
+
+            @Override public TokenStream create(TokenStream tokenStream) {
+                return new ASCIIFoldingFilter(tokenStream);
+            }
+        }));
+
+        tokenFilterFactories.put("length", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+            @Override public String name() {
+                return "length";
+            }
+
+            @Override public TokenStream create(TokenStream tokenStream) {
+                return new LengthFilter(true, tokenStream, 0, Integer.MAX_VALUE);
+            }
+        }));
+
+        tokenFilterFactories.put("lowercase", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+            @Override public String name() {
+                return "lowercase";
+            }
+
+            @Override public TokenStream create(TokenStream tokenStream) {
+                return new LowerCaseFilter(Lucene.ANALYZER_VERSION, tokenStream);
+            }
+        }));
+
+        tokenFilterFactories.put("porterStem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+            @Override public String name() {
+                return "porterStem";
+            }
+
+            @Override public TokenStream create(TokenStream tokenStream) {
+                return new PorterStemFilter(tokenStream);
+            }
+        }));
+
+        tokenFilterFactories.put("porter_stem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+            @Override public String name() {
+                return "porter_stem";
+            }
+
+            @Override public TokenStream create(TokenStream tokenStream) {
+                return new PorterStemFilter(tokenStream);
+            }
+        }));
+
+        tokenFilterFactories.put("standard", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+            @Override public String name() {
+                return "standard";
+            }
+
+            @Override public TokenStream create(TokenStream tokenStream) {
+                return new StandardFilter(Lucene.ANALYZER_VERSION, tokenStream);
+            }
+        }));
+
+        tokenFilterFactories.put("nGram", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+            @Override public String name() {
+                return "nGram";
+            }
+
+            @Override public TokenStream create(TokenStream tokenStream) {
+                return new NGramTokenFilter(tokenStream);
+            }
+        }));
+
+        tokenFilterFactories.put("ngram", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+            @Override public String name() {
+                return "ngram";
+            }
+
+            @Override public TokenStream create(TokenStream tokenStream) {
+                return new NGramTokenFilter(tokenStream);
+            }
+        }));
+
+        tokenFilterFactories.put("edgeNGram", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+            @Override public String name() {
+                return "edgeNGram";
+            }
+
+            @Override public TokenStream create(TokenStream tokenStream) {
+                return new EdgeNGramTokenFilter(tokenStream, EdgeNGramTokenFilter.DEFAULT_SIDE, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE);
+            }
+        }));
+
+        tokenFilterFactories.put("edge_ngram", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+            @Override public String name() {
+                return "edge_ngram";
+            }
+
+            @Override public TokenStream create(TokenStream tokenStream) {
+                return new EdgeNGramTokenFilter(tokenStream, EdgeNGramTokenFilter.DEFAULT_SIDE, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE);
+            }
+        }));
+
+        tokenFilterFactories.put("shingle", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+            @Override public String name() {
+                return "shingle";
+            }
+
+            @Override public TokenStream create(TokenStream tokenStream) {
+                return new ShingleFilter(tokenStream, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
+            }
+        }));
+
+        // Extended Token Filters
+        tokenFilterFactories.put("snowball", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+            @Override public String name() {
+                return "snowball";
+            }
+
+            @Override public TokenStream create(TokenStream tokenStream) {
+                return new SnowballFilter(tokenStream, "English");
+            }
+        }));
+        tokenFilterFactories.put("arabicStem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+            @Override public String name() {
+                return "arabicStem";
+            }
+
+            @Override public TokenStream create(TokenStream tokenStream) {
+                return new ArabicStemFilter(tokenStream);
+            }
+        }));
+        tokenFilterFactories.put("arabic_stem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+            @Override public String name() {
+                return "arabic_stem";
+            }
+
+            @Override public TokenStream create(TokenStream tokenStream) {
+                return new ArabicStemFilter(tokenStream);
+            }
+        }));
+        tokenFilterFactories.put("brazilianStem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+            @Override public String name() {
+                return "brazilianStem";
+            }
+
+            @Override public TokenStream create(TokenStream tokenStream) {
+                return new BrazilianStemFilter(tokenStream);
+            }
+        }));
+        tokenFilterFactories.put("brazilian_stem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+            @Override public String name() {
+                return "brazilian_stem";
+            }
+
+            @Override public TokenStream create(TokenStream tokenStream) {
+                return new BrazilianStemFilter(tokenStream);
+            }
+        }));
+        tokenFilterFactories.put("czechStem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+            @Override public String name() {
+                return "czechStem";
+            }
+
+            @Override public TokenStream create(TokenStream tokenStream) {
+                return new CzechStemFilter(tokenStream);
+            }
+        }));
+        tokenFilterFactories.put("czech_stem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+            @Override public String name() {
+                return "czech_stem";
+            }
+
+            @Override public TokenStream create(TokenStream tokenStream) {
+                return new CzechStemFilter(tokenStream);
+            }
+        }));
+        tokenFilterFactories.put("dutchStem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+            @Override public String name() {
+                return "dutchStem";
+            }
+
+            @Override public TokenStream create(TokenStream tokenStream) {
+                return new DutchStemFilter(tokenStream);
+            }
+        }));
+        tokenFilterFactories.put("dutch_stem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+            @Override public String name() {
+                return "dutch_stem";
+            }
+
+            @Override public TokenStream create(TokenStream tokenStream) {
+                return new DutchStemFilter(tokenStream);
+            }
+        }));
+        tokenFilterFactories.put("frenchStem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+            @Override public String name() {
+                return "frenchStem";
+            }
+
+            @Override public TokenStream create(TokenStream tokenStream) {
+                return new FrenchStemFilter(tokenStream);
+            }
+        }));
+        tokenFilterFactories.put("french_stem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+            @Override public String name() {
+                return "french_stem";
+            }
+
+            @Override public TokenStream create(TokenStream tokenStream) {
+                return new FrenchStemFilter(tokenStream);
+            }
+        }));
+        tokenFilterFactories.put("germanStem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+            @Override public String name() {
+                return "germanStem";
+            }
+
+            @Override public TokenStream create(TokenStream tokenStream) {
+                return new GermanStemFilter(tokenStream);
+            }
+        }));
+        tokenFilterFactories.put("german_stem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+            @Override public String name() {
+                return "german_stem";
+            }
+
+            @Override public TokenStream create(TokenStream tokenStream) {
+                return new GermanStemFilter(tokenStream);
+            }
+        }));
+        tokenFilterFactories.put("russianStem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+            @Override public String name() {
+                return "russianStem";
+            }
+
+            @Override public TokenStream create(TokenStream tokenStream) {
+                return new RussianStemFilter(tokenStream);
+            }
+        }));
+        tokenFilterFactories.put("russian_stem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+            @Override public String name() {
+                return "russian_stem";
+            }
+
+            @Override public TokenStream create(TokenStream tokenStream) {
+                return new RussianStemFilter(tokenStream);
+            }
+        }));
+
+        // Char Filter
+        charFilterFactories.put("html_strip", new PreBuiltCharFilterFactoryFactory(new CharFilterFactory() {
+            @Override public String name() {
+                return "html_strip";
+            }
+
+            @Override public CharStream create(CharStream tokenStream) {
+                return new HTMLStripCharFilter(tokenStream);
+            }
+        }));
+
+        charFilterFactories.put("htmlStrip", new PreBuiltCharFilterFactoryFactory(new CharFilterFactory() {
+            @Override public String name() {
+                return "htmlStrip";
+            }
+
+            @Override public CharStream create(CharStream tokenStream) {
+                return new HTMLStripCharFilter(tokenStream);
+            }
+        }));
+    }
+
+    public boolean hasCharFilter(String name) {
+        return charFilterFactoryFactory(name) != null;
+    }
+
+    public CharFilterFactoryFactory charFilterFactoryFactory(String name) {
+        return charFilterFactories.get(name);
+    }
+
+    public boolean hasTokenFilter(String name) {
+        return tokenFilterFactoryFactory(name) != null;
+    }
+
+    public TokenFilterFactoryFactory tokenFilterFactoryFactory(String name) {
+        return tokenFilterFactories.get(name);
+    }
+
+    public boolean hasTokenizer(String name) {
+        return tokenizerFactoryFactory(name) != null;
+    }
+
+    public TokenizerFactoryFactory tokenizerFactoryFactory(String name) {
+        return tokenizerFactories.get(name);
     }
 
     public PreBuiltAnalyzerProviderFactory analyzerProviderFactory(String name) {