Make PreConfiguredTokenFilter harder to misuse (#24572)
There are now three public static methods to build instances of PreConfiguredTokenFilter, and the constructor is now private. I chose static methods instead of constructors because they allow us to change the implementation returned if we so desire. Relates to #23658
parent d447b79e16
commit 65f2717ab7
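For plugin authors, the practical effect is that a pre-configured token filter is now obtained through one of the three factory methods instead of new PreConfiguredTokenFilter(...), and each method pins the caching strategy that matches its version-awareness (singleton -> ONE, luceneVersion -> LUCENE, elasticsearchVersion -> ELASTICSEARCH), so the two can no longer be mismatched. Below is a minimal sketch of a plugin registering one filter through each method; the plugin class, the filter names, and the choice of LowerCaseFilter are illustrative only and not part of this commit.

import org.apache.lucene.analysis.LowerCaseFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

import java.util.Arrays;
import java.util.List;

public class MyAnalysisPlugin extends Plugin implements AnalysisPlugin {
    @Override
    public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
        return Arrays.asList(
                // Never varies, so it is built once and shared (CachingStrategy.ONE).
                PreConfiguredTokenFilter.singleton("my_lowercase", true, LowerCaseFilter::new),
                // May vary by Lucene version, so it is cached per Lucene version (CachingStrategy.LUCENE).
                PreConfiguredTokenFilter.luceneVersion("my_lucene_lowercase", false,
                        (tokenStream, luceneVersion) -> new LowerCaseFilter(tokenStream)),
                // May vary by Elasticsearch version, so it is cached per ES version (CachingStrategy.ELASTICSEARCH).
                PreConfiguredTokenFilter.elasticsearchVersion("my_es_lowercase", false,
                        (tokenStream, esVersion) -> new LowerCaseFilter(tokenStream)));
    }
}

Because callers now go through a factory method, the class is free to change the concrete type it hands back later, which is the reason given above for preferring static methods over constructors.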
@@ -27,6 +27,7 @@ import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.indices.analysis.AnalysisModule;
 import org.elasticsearch.indices.analysis.PreBuiltCacheFactory;
+import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
 
 import java.io.IOException;
 import java.util.function.BiFunction;
@@ -36,31 +37,46 @@ import java.util.function.Function;
  * Provides pre-configured, shared {@link TokenFilter}s.
  */
 public final class PreConfiguredTokenFilter implements AnalysisModule.AnalysisProvider<TokenFilterFactory> {
+    /**
+     * Create a pre-configured token filter that may not vary at all.
+     */
+    public static PreConfiguredTokenFilter singleton(String name, boolean useFilterForMultitermQueries,
+            Function<TokenStream, TokenStream> create) {
+        return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.ONE,
+                (tokenStream, version) -> create.apply(tokenStream));
+    }
+
+    /**
+     * Create a pre-configured token filter that may vary based on the Lucene version.
+     */
+    public static PreConfiguredTokenFilter luceneVersion(String name, boolean useFilterForMultitermQueries,
+            BiFunction<TokenStream, org.apache.lucene.util.Version, TokenStream> create) {
+        return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.LUCENE,
+                (tokenStream, version) -> create.apply(tokenStream, version.luceneVersion));
+    }
+
+    /**
+     * Create a pre-configured token filter that may vary based on the Elasticsearch version.
+     */
+    public static PreConfiguredTokenFilter elasticsearchVersion(String name, boolean useFilterForMultitermQueries,
+            BiFunction<TokenStream, org.elasticsearch.Version, TokenStream> create) {
+        return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.ELASTICSEARCH,
+                (tokenStream, version) -> create.apply(tokenStream, version));
+    }
+
     private final String name;
     private final boolean useFilterForMultitermQueries;
     private final PreBuiltCacheFactory.PreBuiltCache<TokenFilterFactory> cache;
     private final BiFunction<TokenStream, Version, TokenStream> create;
 
-    /**
-     * Standard ctor with all the power.
-     */
-    public PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries,
-            PreBuiltCacheFactory.CachingStrategy cachingStrategy, BiFunction<TokenStream, Version, TokenStream> create) {
+    private PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries,
+            PreBuiltCacheFactory.CachingStrategy cache, BiFunction<TokenStream, Version, TokenStream> create) {
         this.name = name;
         this.useFilterForMultitermQueries = useFilterForMultitermQueries;
-        cache = PreBuiltCacheFactory.getCache(cachingStrategy);
+        this.cache = PreBuiltCacheFactory.getCache(cache);
         this.create = create;
     }
 
-    /**
-     * Convenience ctor for token streams that don't vary based on version.
-     */
-    public PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries,
-            PreBuiltCacheFactory.CachingStrategy cachingStrategy, Function<TokenStream, TokenStream> create) {
-        this(name, useFilterForMultitermQueries, cachingStrategy, (input, version) -> create.apply(input));
-        // TODO why oh why aren't these all CachingStrategy.ONE? They *can't* vary based on version because they don't get it, right?!
-    }
-
     @Override
     public TokenFilterFactory get(IndexSettings indexSettings, Environment environment, String name, Settings settings) throws IOException {
         return getTokenFilterFactory(Version.indexCreated(settings));
@@ -272,10 +272,8 @@ public final class AnalysisModule {
         NamedRegistry<PreConfiguredTokenFilter> preConfiguredTokenFilters = new NamedRegistry<>("pre-configured token_filter");
 
         // Add filters available in lucene-core
-        preConfiguredTokenFilters.register("lowercase",
-                new PreConfiguredTokenFilter("lowercase", true, CachingStrategy.LUCENE, LowerCaseFilter::new));
-        preConfiguredTokenFilters.register("standard",
-                new PreConfiguredTokenFilter("standard", false, CachingStrategy.LUCENE, StandardFilter::new));
+        preConfiguredTokenFilters.register("lowercase", PreConfiguredTokenFilter.singleton("lowercase", true, LowerCaseFilter::new));
+        preConfiguredTokenFilters.register("standard", PreConfiguredTokenFilter.singleton("standard", false, StandardFilter::new));
         /* Note that "stop" is available in lucene-core but it's pre-built
          * version uses a set of English stop words that are in
          * lucene-analyzers-common so "stop" is defined in the analysis-common

@@ -288,9 +286,12 @@ public final class AnalysisModule {
             case LOWERCASE:
                 // This has been migrated but has to stick around until PreBuiltTokenizers is removed.
                 continue;
            default:
+                if (CachingStrategy.ONE != preBuilt.getCachingStrategy()) {
+                    throw new UnsupportedOperationException("shim not available for " + preBuilt.getCachingStrategy());
+                }
                 String name = preBuilt.name().toLowerCase(Locale.ROOT);
-                preConfiguredTokenFilters.register(name,
-                        new PreConfiguredTokenFilter(name, preBuilt.isMultiTermAware(), preBuilt.getCachingStrategy(), preBuilt::create));
+                preConfiguredTokenFilters.register(name, PreConfiguredTokenFilter.singleton(name, preBuilt.isMultiTermAware(),
+                        tokenStream -> preBuilt.create(tokenStream, Version.CURRENT)));
             }
         }
 
@@ -30,7 +30,6 @@ import org.apache.lucene.analysis.core.DecimalDigitFilter;
 import org.apache.lucene.analysis.cz.CzechStemFilter;
 import org.apache.lucene.analysis.de.GermanNormalizationFilter;
 import org.apache.lucene.analysis.de.GermanStemFilter;
-import org.apache.lucene.analysis.en.PorterStemFilter;
 import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
 import org.apache.lucene.analysis.fr.FrenchAnalyzer;
 import org.apache.lucene.analysis.hi.HindiNormalizationFilter;

@@ -70,20 +69,6 @@ public enum PreBuiltTokenFilters {
     },
 
     // Extended Token Filters
-    SNOWBALL(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new SnowballFilter(tokenStream, "English");
-        }
-    },
-
-    STEMMER(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new PorterStemFilter(tokenStream);
-        }
-    },
-
     ELISION(CachingStrategy.ONE) {
         @Override
         public TokenStream create(TokenStream tokenStream, Version version) {
@@ -34,7 +34,6 @@ import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.indices.analysis.AnalysisModule;
 import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
 import org.elasticsearch.indices.analysis.PreBuiltAnalyzers;
-import org.elasticsearch.indices.analysis.PreBuiltCacheFactory;
 import org.elasticsearch.plugins.AnalysisPlugin;
 import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.test.IndexSettingsModule;

@@ -207,12 +206,11 @@ public class AnalysisRegistryTests extends ESTestCase {
 
     public void testPreConfiguredTokenFiltersAreCached() throws IOException {
         AtomicBoolean built = new AtomicBoolean(false);
-        PreConfiguredTokenFilter assertsBuiltOnce = new PreConfiguredTokenFilter("asserts_built_once", false,
-                PreBuiltCacheFactory.CachingStrategy.ONE, (tokens, version) -> {
+        PreConfiguredTokenFilter assertsBuiltOnce = PreConfiguredTokenFilter.singleton("asserts_built_once", false, tokenStream -> {
             if (false == built.compareAndSet(false, true)) {
                 fail("Attempted to build the token filter twice when it should have been cached");
             }
-            return new MockTokenFilter(tokens, MockTokenFilter.EMPTY_STOPSET);
+            return new MockTokenFilter(tokenStream, MockTokenFilter.EMPTY_STOPSET);
         });
         try (AnalysisRegistry registryWithPreBuiltTokenFilter = new AnalysisRegistry(emptyEnvironment, emptyMap(), emptyMap(), emptyMap(),
                 emptyMap(), emptyMap(), singletonMap("asserts_built_once", assertsBuiltOnce))) {
@@ -24,7 +24,6 @@ import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
-import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
 import org.elasticsearch.plugins.AnalysisPlugin;
 import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.test.ESTokenStreamTestCase;

@@ -113,7 +112,7 @@ public class CustomNormalizerTests extends ESTokenStreamTestCase {
     private static class MockAnalysisPlugin implements AnalysisPlugin {
         @Override
         public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
-            return singletonList(new PreConfiguredTokenFilter("mock_forbidden", false, CachingStrategy.ONE, MockLowerCaseFilter::new));
+            return singletonList(PreConfiguredTokenFilter.singleton("mock_forbidden", false, MockLowerCaseFilter::new));
         }
 
         @Override
@@ -32,7 +32,6 @@ import org.elasticsearch.common.xcontent.XContentType;
 import org.elasticsearch.index.IndexService;
 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
 import org.elasticsearch.index.mapper.MapperService.MergeReason;
-import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
 import org.elasticsearch.plugins.AnalysisPlugin;
 import org.elasticsearch.plugins.Plugin;
 import org.elasticsearch.test.ESSingleNodeTestCase;

@@ -55,7 +54,7 @@ public class KeywordFieldMapperTests extends ESSingleNodeTestCase {
     public static class MockAnalysisPlugin extends Plugin implements AnalysisPlugin {
         @Override
         public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
-            return singletonList(new PreConfiguredTokenFilter("mock_other_lowercase", true, CachingStrategy.ONE, MockLowerCaseFilter::new));
+            return singletonList(PreConfiguredTokenFilter.singleton("mock_other_lowercase", true, MockLowerCaseFilter::new));
         }
     };
 
@@ -20,6 +20,7 @@
 package org.elasticsearch.indices.analysis;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;

@@ -28,6 +29,7 @@ import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
 import org.apache.lucene.analysis.hunspell.Dictionary;
 import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.SimpleFSDirectory;
 import org.elasticsearch.Version;

@@ -43,6 +45,7 @@ import org.elasticsearch.index.analysis.CharFilterFactory;
 import org.elasticsearch.index.analysis.CustomAnalyzer;
 import org.elasticsearch.index.analysis.IndexAnalyzers;
 import org.elasticsearch.index.analysis.NamedAnalyzer;
+import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
 import org.elasticsearch.index.analysis.StandardTokenizerFactory;
 import org.elasticsearch.index.analysis.StopTokenFilterFactory;
 import org.elasticsearch.index.analysis.TokenFilterFactory;

@@ -61,17 +64,23 @@ import java.io.StringReader;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.util.Arrays;
+import java.util.List;
 import java.util.Map;
 import java.util.Set;
 
 import static java.util.Collections.singletonList;
 import static java.util.Collections.singletonMap;
+import static org.apache.lucene.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;
 import static org.hamcrest.Matchers.either;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.instanceOf;
 import static org.hamcrest.Matchers.is;
 
 public class AnalysisModuleTests extends ESTestCase {
+    private final Settings emptyNodeSettings = Settings.builder()
+            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+            .build();
 
     public IndexAnalyzers getIndexAnalyzers(Settings settings) throws IOException {
         return getIndexAnalyzers(getNewRegistry(settings), settings);

@@ -264,6 +273,71 @@ public class AnalysisModuleTests extends ESTestCase {
         }
     }
 
+    /**
+     * Tests that plugins can register pre-configured token filters that vary in behavior based on Elasticsearch version, Lucene version,
+     * and that do not vary based on version at all.
+     */
+    public void testPluginPreConfiguredTokenFilters() throws IOException {
+        // Simple token filter that appends text to the term
+        final class AppendTokenFilter extends TokenFilter {
+            private final CharTermAttribute term = addAttribute(CharTermAttribute.class);
+            private final char[] appendMe;
+
+            protected AppendTokenFilter(TokenStream input, String appendMe) {
+                super(input);
+                this.appendMe = appendMe.toCharArray();
+            }
+
+            @Override
+            public boolean incrementToken() throws IOException {
+                if (false == input.incrementToken()) {
+                    return false;
+                }
+                term.resizeBuffer(term.length() + appendMe.length);
+                System.arraycopy(appendMe, 0, term.buffer(), term.length(), appendMe.length);
+                term.setLength(term.length() + appendMe.length);
+                return true;
+            }
+        }
+        boolean noVersionSupportsMultiTerm = randomBoolean();
+        boolean luceneVersionSupportsMultiTerm = randomBoolean();
+        boolean elasticsearchVersionSupportsMultiTerm = randomBoolean();
+        AnalysisRegistry registry = new AnalysisModule(new Environment(emptyNodeSettings), singletonList(new AnalysisPlugin() {
+            @Override
+            public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
+                return Arrays.asList(
+                        PreConfiguredTokenFilter.singleton("no_version", noVersionSupportsMultiTerm,
+                                tokenStream -> new AppendTokenFilter(tokenStream, "no_version")),
+                        PreConfiguredTokenFilter.luceneVersion("lucene_version", luceneVersionSupportsMultiTerm,
+                                (tokenStream, luceneVersion) -> new AppendTokenFilter(tokenStream, luceneVersion.toString())),
+                        PreConfiguredTokenFilter.elasticsearchVersion("elasticsearch_version", elasticsearchVersionSupportsMultiTerm,
+                                (tokenStream, esVersion) -> new AppendTokenFilter(tokenStream, esVersion.toString()))
+                );
+            }
+        })).getAnalysisRegistry();
+
+        Version version = VersionUtils.randomVersion(random());
+        IndexAnalyzers analyzers = getIndexAnalyzers(registry, Settings.builder()
+                .put("index.analysis.analyzer.no_version.tokenizer", "keyword")
+                .put("index.analysis.analyzer.no_version.filter", "no_version")
+                .put("index.analysis.analyzer.lucene_version.tokenizer", "keyword")
+                .put("index.analysis.analyzer.lucene_version.filter", "lucene_version")
+                .put("index.analysis.analyzer.elasticsearch_version.tokenizer", "keyword")
+                .put("index.analysis.analyzer.elasticsearch_version.filter", "elasticsearch_version")
+                .put(IndexMetaData.SETTING_VERSION_CREATED, version)
+                .build());
+        assertTokenStreamContents(analyzers.get("no_version").tokenStream("", "test"), new String[] {"testno_version"});
+        assertTokenStreamContents(analyzers.get("lucene_version").tokenStream("", "test"), new String[] {"test" + version.luceneVersion});
+        assertTokenStreamContents(analyzers.get("elasticsearch_version").tokenStream("", "test"), new String[] {"test" + version});
+
+        assertEquals("test" + (noVersionSupportsMultiTerm ? "no_version" : ""),
+                analyzers.get("no_version").normalize("", "test").utf8ToString());
+        assertEquals("test" + (luceneVersionSupportsMultiTerm ? version.luceneVersion.toString() : ""),
+                analyzers.get("lucene_version").normalize("", "test").utf8ToString());
+        assertEquals("test" + (elasticsearchVersionSupportsMultiTerm ? version.toString() : ""),
+                analyzers.get("elasticsearch_version").normalize("", "test").utf8ToString());
+    }
+
     public void testRegisterHunspellDictionary() throws Exception {
         Settings settings = Settings.builder()
                 .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
@@ -36,13 +36,13 @@ import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
 import org.apache.lucene.analysis.ngram.NGramTokenFilter;
 import org.apache.lucene.analysis.reverse.ReverseStringFilter;
+import org.apache.lucene.analysis.snowball.SnowballFilter;
 import org.apache.lucene.analysis.standard.ClassicFilter;
 import org.elasticsearch.index.analysis.CharFilterFactory;
 import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory;
 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
-import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
 import org.elasticsearch.plugins.AnalysisPlugin;
 import org.elasticsearch.plugins.Plugin;
 

@@ -73,41 +73,40 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
 
     @Override
     public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
-        // TODO we should revisit the caching strategies.
         List<PreConfiguredTokenFilter> filters = new ArrayList<>();
-        filters.add(new PreConfiguredTokenFilter("asciifolding", true, CachingStrategy.ONE, input -> new ASCIIFoldingFilter(input)));
-        filters.add(new PreConfiguredTokenFilter("classic", false, CachingStrategy.ONE, ClassicFilter::new));
-        filters.add(new PreConfiguredTokenFilter("common_grams", false, CachingStrategy.LUCENE, input ->
-                new CommonGramsFilter(input, CharArraySet.EMPTY_SET)));
-        filters.add(new PreConfiguredTokenFilter("edge_ngram", false, CachingStrategy.LUCENE, input ->
+        filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, input -> new ASCIIFoldingFilter(input)));
+        filters.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new));
+        filters.add(PreConfiguredTokenFilter.singleton("common_grams", false,
+                input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET)));
+        filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false, input ->
                 new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE)));
         // TODO deprecate edgeNGram
-        filters.add(new PreConfiguredTokenFilter("edgeNGram", false, CachingStrategy.LUCENE, input ->
+        filters.add(PreConfiguredTokenFilter.singleton("edgeNGram", false, input ->
                 new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE)));
-        filters.add(new PreConfiguredTokenFilter("kstem", false, CachingStrategy.ONE, KStemFilter::new));
-        filters.add(new PreConfiguredTokenFilter("length", false, CachingStrategy.LUCENE, input ->
+        filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
+        filters.add(PreConfiguredTokenFilter.singleton("length", false, input ->
                 new LengthFilter(input, 0, Integer.MAX_VALUE))); // TODO this one seems useless
-        filters.add(new PreConfiguredTokenFilter("ngram", false, CachingStrategy.LUCENE, NGramTokenFilter::new));
+        filters.add(PreConfiguredTokenFilter.singleton("ngram", false, NGramTokenFilter::new));
         // TODO deprecate nGram
-        filters.add(new PreConfiguredTokenFilter("nGram", false, CachingStrategy.LUCENE, NGramTokenFilter::new));
-        filters.add(new PreConfiguredTokenFilter("porter_stem", false, CachingStrategy.ONE, PorterStemFilter::new));
-        filters.add(new PreConfiguredTokenFilter("reverse", false, CachingStrategy.LUCENE, input -> new ReverseStringFilter(input)));
+        filters.add(PreConfiguredTokenFilter.singleton("nGram", false, NGramTokenFilter::new));
+        filters.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new));
+        filters.add(PreConfiguredTokenFilter.singleton("reverse", false, input -> new ReverseStringFilter(input)));
+        filters.add(PreConfiguredTokenFilter.singleton("snowball", false, input -> new SnowballFilter(input, "English")));
+        filters.add(PreConfiguredTokenFilter.singleton("stemmer", false, PorterStemFilter::new));
         // The stop filter is in lucene-core but the English stop words set is in lucene-analyzers-common
-        filters.add(new PreConfiguredTokenFilter("stop", false, CachingStrategy.LUCENE, input ->
-                new StopFilter(input, StopAnalyzer.ENGLISH_STOP_WORDS_SET)));
-        filters.add(new PreConfiguredTokenFilter("trim", false, CachingStrategy.LUCENE, TrimFilter::new));
-        filters.add(new PreConfiguredTokenFilter("truncate", false, CachingStrategy.ONE, input ->
-                new TruncateTokenFilter(input, 10)));
-        filters.add(new PreConfiguredTokenFilter("unique", false, CachingStrategy.ONE, input -> new UniqueTokenFilter(input)));
-        filters.add(new PreConfiguredTokenFilter("uppercase", true, CachingStrategy.LUCENE, UpperCaseFilter::new));
-        filters.add(new PreConfiguredTokenFilter("word_delimiter", false, CachingStrategy.ONE, input ->
+        filters.add(PreConfiguredTokenFilter.singleton("stop", false, input -> new StopFilter(input, StopAnalyzer.ENGLISH_STOP_WORDS_SET)));
+        filters.add(PreConfiguredTokenFilter.singleton("trim", false, TrimFilter::new));
+        filters.add(PreConfiguredTokenFilter.singleton("truncate", false, input -> new TruncateTokenFilter(input, 10)));
+        filters.add(PreConfiguredTokenFilter.singleton("unique", false, input -> new UniqueTokenFilter(input)));
+        filters.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new));
+        filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false, input ->
                 new WordDelimiterFilter(input,
                         WordDelimiterFilter.GENERATE_WORD_PARTS
                                 | WordDelimiterFilter.GENERATE_NUMBER_PARTS
                                 | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
                                 | WordDelimiterFilter.SPLIT_ON_NUMERICS
                                 | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null)));
-        filters.add(new PreConfiguredTokenFilter("word_delimiter_graph", false, CachingStrategy.ONE, input ->
+        filters.add(PreConfiguredTokenFilter.singleton("word_delimiter_graph", false, input ->
                 new WordDelimiterGraphFilter(input,
                         WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                                 | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
@@ -19,7 +19,9 @@
 
 package org.elasticsearch.analysis.common;
 
+import org.apache.lucene.analysis.en.PorterStemFilterFactory;
 import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory;
+import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
 import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory;
 import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase;
 

@@ -77,6 +79,8 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
         filters.put("nGram", null);
         filters.put("porter_stem", null);
         filters.put("reverse", ReverseStringFilterFactory.class);
+        filters.put("snowball", SnowballPorterFilterFactory.class);
+        filters.put("stemmer", PorterStemFilterFactory.class);
         filters.put("stop", null);
         filters.put("trim", null);
         filters.put("truncate", null);
@@ -350,15 +350,11 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
             case LOWERCASE:
                 // This has been migrated but has to stick around until PreBuiltTokenizers is removed.
                 continue;
-            case SNOWBALL:
             case DUTCH_STEM:
             case FRENCH_STEM:
             case RUSSIAN_STEM:
                 luceneFactoryClass = SnowballPorterFilterFactory.class;
                 break;
-            case STEMMER:
-                luceneFactoryClass = PorterStemFilterFactory.class;
-                break;
             case DELIMITED_PAYLOAD_FILTER:
                 luceneFactoryClass = org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory.class;
                 break;