diff --git a/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java b/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java
index b438cd5af41..5d099267c79 100644
--- a/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java
+++ b/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java
@@ -74,14 +74,15 @@ public final class AnalysisRegistry implements Closeable {
                             Map<String, AnalysisProvider<TokenizerFactory>> tokenizers,
                             Map<String, AnalysisProvider<AnalyzerProvider<?>>> analyzers,
                             Map<String, AnalysisProvider<AnalyzerProvider<?>>> normalizers,
-                            Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters) {
+                            Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters,
+                            Map<String, PreConfiguredTokenizer> preConfiguredTokenizers) {
         this.environment = environment;
         this.charFilters = unmodifiableMap(charFilters);
         this.tokenFilters = unmodifiableMap(tokenFilters);
         this.tokenizers = unmodifiableMap(tokenizers);
         this.analyzers = unmodifiableMap(analyzers);
         this.normalizers = unmodifiableMap(normalizers);
-        prebuiltAnalysis = new PrebuiltAnalysis(preConfiguredTokenFilters);
+        prebuiltAnalysis = new PrebuiltAnalysis(preConfiguredTokenFilters, preConfiguredTokenizers);
     }

     /**
@@ -169,12 +170,12 @@ public final class AnalysisRegistry implements Closeable {
          */
         tokenFilters.put("synonym", requiresAnalysisSettings((is, env, name, settings) -> new SynonymTokenFilterFactory(is, env, this, name, settings)));
         tokenFilters.put("synonym_graph", requiresAnalysisSettings((is, env, name, settings) -> new SynonymGraphTokenFilterFactory(is, env, this, name, settings)));
-        return buildMapping(Component.FILTER, indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.tokenFilterFactories);
+        return buildMapping(Component.FILTER, indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.preConfiguredTokenFilters);
     }

     public Map<String, TokenizerFactory> buildTokenizerFactories(IndexSettings indexSettings) throws IOException {
         final Map<String, Settings> tokenizersSettings = indexSettings.getSettings().getGroups(INDEX_ANALYSIS_TOKENIZER);
-        return buildMapping(Component.TOKENIZER, indexSettings, tokenizersSettings, tokenizers, prebuiltAnalysis.tokenizerFactories);
+        return buildMapping(Component.TOKENIZER, indexSettings, tokenizersSettings, tokenizers, prebuiltAnalysis.preConfiguredTokenizers);
     }

     public Map<String, CharFilterFactory> buildCharFilterFactories(IndexSettings indexSettings) throws IOException {
@@ -394,31 +395,22 @@ public final class AnalysisRegistry implements Closeable {
     private static class PrebuiltAnalysis implements Closeable {

         final Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<?>>> analyzerProviderFactories;
-        final Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> tokenizerFactories;
-        final Map<String, AnalysisModule.AnalysisProvider<TokenFilterFactory>> tokenFilterFactories;
+        final Map<String, ? extends AnalysisModule.AnalysisProvider<TokenFilterFactory>> preConfiguredTokenFilters;
+        final Map<String, ? extends AnalysisModule.AnalysisProvider<TokenizerFactory>> preConfiguredTokenizers;
         final Map<String, AnalysisModule.AnalysisProvider<CharFilterFactory>> charFilterFactories;

-        private PrebuiltAnalysis(Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters) {
+        private PrebuiltAnalysis(
+                Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters,
+                Map<String, PreConfiguredTokenizer> preConfiguredTokenizers) {
             Map<String, PreBuiltAnalyzerProviderFactory> analyzerProviderFactories = new HashMap<>();
-            Map<String, PreBuiltTokenizerFactoryFactory> tokenizerFactories = new HashMap<>();
             Map<String, PreBuiltCharFilterFactoryFactory> charFilterFactories = new HashMap<>();
+            // Analyzers
             for (PreBuiltAnalyzers preBuiltAnalyzerEnum : PreBuiltAnalyzers.values()) {
                 String name = preBuiltAnalyzerEnum.name().toLowerCase(Locale.ROOT);
                 analyzerProviderFactories.put(name, new PreBuiltAnalyzerProviderFactory(name, AnalyzerScope.INDICES, preBuiltAnalyzerEnum.getAnalyzer(Version.CURRENT)));
             }

-            // Tokenizers
-            for (PreBuiltTokenizers preBuiltTokenizer : PreBuiltTokenizers.values()) {
-                String name = preBuiltTokenizer.name().toLowerCase(Locale.ROOT);
-                tokenizerFactories.put(name, new PreBuiltTokenizerFactoryFactory(preBuiltTokenizer.getTokenizerFactory(Version.CURRENT)));
-            }
-
-            // Tokenizer aliases
-            tokenizerFactories.put("nGram", new PreBuiltTokenizerFactoryFactory(PreBuiltTokenizers.NGRAM.getTokenizerFactory(Version.CURRENT)));
-            tokenizerFactories.put("edgeNGram", new PreBuiltTokenizerFactoryFactory(PreBuiltTokenizers.EDGE_NGRAM.getTokenizerFactory(Version.CURRENT)));
-            tokenizerFactories.put("PathHierarchy", new PreBuiltTokenizerFactoryFactory(PreBuiltTokenizers.PATH_HIERARCHY.getTokenizerFactory(Version.CURRENT)));
-
             // Char Filters
             for (PreBuiltCharFilters preBuiltCharFilter : PreBuiltCharFilters.values()) {
                 String name = preBuiltCharFilter.name().toLowerCase(Locale.ROOT);
@@ -429,8 +421,8 @@ public final class AnalysisRegistry implements Closeable {

             this.analyzerProviderFactories = Collections.unmodifiableMap(analyzerProviderFactories);
             this.charFilterFactories = Collections.unmodifiableMap(charFilterFactories);
-            this.tokenizerFactories = Collections.unmodifiableMap(tokenizerFactories);
-            tokenFilterFactories = preConfiguredTokenFilters;
+            this.preConfiguredTokenFilters = preConfiguredTokenFilters;
+            this.preConfiguredTokenizers = preConfiguredTokenizers;
         }

         public AnalysisModule.AnalysisProvider<CharFilterFactory> getCharFilterFactory(String name) {
@@ -438,11 +430,11 @@ public final class AnalysisRegistry implements Closeable {
         }

         public AnalysisModule.AnalysisProvider<TokenFilterFactory> getTokenFilterFactory(String name) {
-            return tokenFilterFactories.get(name);
+            return preConfiguredTokenFilters.get(name);
         }

         public AnalysisModule.AnalysisProvider<TokenizerFactory> getTokenizerFactory(String name) {
-            return tokenizerFactories.get(name);
+            return preConfiguredTokenizers.get(name);
         }

         public AnalysisModule.AnalysisProvider<AnalyzerProvider<?>> getAnalyzerProvider(String name) {
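Note: the registry constructor now threads a second map of pre-configured components through to `PrebuiltAnalysis`. A minimal sketch of the new call-site shape (it mirrors the test fixtures updated later in this patch; the `Environment` is assumed to come from the caller):

```java
import static java.util.Collections.emptyMap;

import org.elasticsearch.env.Environment;
import org.elasticsearch.index.analysis.AnalysisRegistry;

class EmptyRegistryExample {
    // Sketch: an "empty" registry after this change takes seven maps instead of six.
    static AnalysisRegistry emptyRegistry(Environment environment) {
        return new AnalysisRegistry(environment,
                emptyMap(),  // char filters
                emptyMap(),  // token filters
                emptyMap(),  // tokenizers
                emptyMap(),  // analyzers
                emptyMap(),  // normalizers
                emptyMap(),  // pre-configured token filters
                emptyMap()); // pre-configured tokenizers (new in this patch)
    }
}
```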
diff --git a/core/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenizerFactoryFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenizerFactoryFactory.java
deleted file mode 100644
index 02218bd7ceb..00000000000
--- a/core/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenizerFactoryFactory.java
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Licensed to Elasticsearch under one or more contributor
- * license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright
- * ownership. Elasticsearch licenses this file to you under
- * the Apache License, Version 2.0 (the "License"); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.elasticsearch.index.analysis;
-
-import org.elasticsearch.Version;
-import org.elasticsearch.common.settings.Settings;
-import org.elasticsearch.env.Environment;
-import org.elasticsearch.index.IndexSettings;
-import org.elasticsearch.indices.analysis.AnalysisModule;
-import org.elasticsearch.indices.analysis.PreBuiltTokenizers;
-
-import java.io.IOException;
-
-public class PreBuiltTokenizerFactoryFactory implements AnalysisModule.AnalysisProvider<TokenizerFactory> {
-
-    private final TokenizerFactory tokenizerFactory;
-
-    public PreBuiltTokenizerFactoryFactory(TokenizerFactory tokenizerFactory) {
-        this.tokenizerFactory = tokenizerFactory;
-    }
-
-    public TokenizerFactory get(IndexSettings indexSettings, Environment environment, String name, Settings settings) throws IOException {
-        Version indexVersion = Version.indexCreated(settings);
-        if (!Version.CURRENT.equals(indexVersion)) {
-            PreBuiltTokenizers preBuiltTokenizers = PreBuiltTokenizers.getOrDefault(name, null);
-            if (preBuiltTokenizers != null) {
-                return preBuiltTokenizers.getTokenizerFactory(indexVersion);
-            }
-        }
-
-        return tokenizerFactory;
-    }
-}
diff --git a/core/src/main/java/org/elasticsearch/index/analysis/PreConfiguredAnalysisComponent.java b/core/src/main/java/org/elasticsearch/index/analysis/PreConfiguredAnalysisComponent.java
new file mode 100644
index 00000000000..fdd525d0c80
--- /dev/null
+++ b/core/src/main/java/org/elasticsearch/index/analysis/PreConfiguredAnalysisComponent.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.elasticsearch.Version;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.indices.analysis.AnalysisModule;
+import org.elasticsearch.indices.analysis.PreBuiltCacheFactory;
+
+import java.io.IOException;
+
+/**
+ * Shared implementation for pre-configured analysis components.
+ */
+public abstract class PreConfiguredAnalysisComponent<T> implements AnalysisModule.AnalysisProvider<T> {
+    private final String name;
+    private final PreBuiltCacheFactory.PreBuiltCache<T> cache;
+
+    protected PreConfiguredAnalysisComponent(String name, PreBuiltCacheFactory.CachingStrategy cache) {
+        this.name = name;
+        this.cache = PreBuiltCacheFactory.getCache(cache);
+    }
+
+    @Override
+    public T get(IndexSettings indexSettings, Environment environment, String name, Settings settings) throws IOException {
+        Version versionCreated = Version.indexCreated(settings);
+        synchronized (this) {
+            T factory = cache.get(versionCreated);
+            if (factory == null) {
+                factory = create(versionCreated);
+                cache.put(versionCreated, factory);
+            }
+            return factory;
+        }
+    }
+
+    /**
+     * The name of the analysis component in the API.
+     */
+    public String getName() {
+        return name;
+    }
+
+    protected abstract T create(Version version);
+}
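The new base class centralizes the synchronized build-once-per-version caching that `PreConfiguredTokenFilter` previously inlined. A minimal sketch of the contract a subclass relies on; the `CountingComponent` subclass is hypothetical and exists only to illustrate that `create(Version)` runs at most once per cache key:

```java
import org.elasticsearch.Version;
import org.elasticsearch.index.analysis.PreConfiguredAnalysisComponent;
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;

import java.util.concurrent.atomic.AtomicInteger;

// Hypothetical subclass: get(...) consults the PreBuiltCache first, so create(Version)
// is invoked at most once per cache key (CachingStrategy.ONE shares one key for all versions).
class CountingComponent extends PreConfiguredAnalysisComponent<String> {
    final AtomicInteger builds = new AtomicInteger();

    CountingComponent() {
        super("counting_example", CachingStrategy.ONE);
    }

    @Override
    protected String create(Version version) {
        builds.incrementAndGet(); // expected to stay at 1 under CachingStrategy.ONE
        return "factory built for " + version;
    }
}
```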
diff --git a/core/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenFilter.java b/core/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenFilter.java
index 1d9e4459c7e..777fb589c9d 100644
--- a/core/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenFilter.java
+++ b/core/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenFilter.java
@@ -22,21 +22,16 @@ package org.elasticsearch.index.analysis;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.elasticsearch.Version;
-import org.elasticsearch.common.settings.Settings;
-import org.elasticsearch.env.Environment;
-import org.elasticsearch.index.IndexSettings;
-import org.elasticsearch.indices.analysis.AnalysisModule;
 import org.elasticsearch.indices.analysis.PreBuiltCacheFactory;
 import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;

-import java.io.IOException;
 import java.util.function.BiFunction;
 import java.util.function.Function;

 /**
  * Provides pre-configured, shared {@link TokenFilter}s.
  */
-public final class PreConfiguredTokenFilter implements AnalysisModule.AnalysisProvider<TokenFilterFactory> {
+public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisComponent<TokenFilterFactory> {
     /**
      * Create a pre-configured token filter that may not vary at all.
      */
@@ -60,35 +55,19 @@ public final class PreConfiguredTokenFilter implements AnalysisModule.AnalysisPr
      */
     public static PreConfiguredTokenFilter elasticsearchVersion(String name, boolean useFilterForMultitermQueries,
             BiFunction<TokenStream, Version, TokenStream> create) {
-        return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.ELASTICSEARCH,
-                (tokenStream, version) -> create.apply(tokenStream, version));
+        return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.ELASTICSEARCH, create);
     }

-    private final String name;
     private final boolean useFilterForMultitermQueries;
-    private final PreBuiltCacheFactory.PreBuiltCache<TokenFilterFactory> cache;
     private final BiFunction<TokenStream, Version, TokenStream> create;

     private PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries,
             PreBuiltCacheFactory.CachingStrategy cache, BiFunction<TokenStream, Version, TokenStream> create) {
-        this.name = name;
+        super(name, cache);
         this.useFilterForMultitermQueries = useFilterForMultitermQueries;
-        this.cache = PreBuiltCacheFactory.getCache(cache);
         this.create = create;
     }

-    @Override
-    public TokenFilterFactory get(IndexSettings indexSettings, Environment environment, String name, Settings settings) throws IOException {
-        return getTokenFilterFactory(Version.indexCreated(settings));
-    }
-
-    /**
-     * The name of the {@link TokenFilter} in the API.
-     */
-    public String getName() {
-        return name;
-    }
-
     /**
      * Can this {@link TokenFilter} be used in multi-term queries?
      */
@@ -98,42 +77,36 @@ public final class PreConfiguredTokenFilter implements AnalysisModule.AnalysisPr

     private interface MultiTermAwareTokenFilterFactory extends TokenFilterFactory, MultiTermAwareComponent {}

-    private synchronized TokenFilterFactory getTokenFilterFactory(final Version version) {
-        TokenFilterFactory factory = cache.get(version);
-        if (factory == null) {
-            if (useFilterForMultitermQueries) {
-                factory = new MultiTermAwareTokenFilterFactory() {
-                    @Override
-                    public String name() {
-                        return name;
-                    }
+    @Override
+    protected TokenFilterFactory create(Version version) {
+        if (useFilterForMultitermQueries) {
+            return new MultiTermAwareTokenFilterFactory() {
+                @Override
+                public String name() {
+                    return getName();
+                }

-                    @Override
-                    public TokenStream create(TokenStream tokenStream) {
-                        return create.apply(tokenStream, version);
-                    }
+                @Override
+                public TokenStream create(TokenStream tokenStream) {
+                    return create.apply(tokenStream, version);
+                }

-                    @Override
-                    public Object getMultiTermComponent() {
-                        return this;
-                    }
-                };
-            } else {
-                factory = new TokenFilterFactory() {
-                    @Override
-                    public String name() {
-                        return name;
-                    }
-
-                    @Override
-                    public TokenStream create(TokenStream tokenStream) {
-                        return create.apply(tokenStream, version);
-                    }
-                };
-            }
-            cache.put(version, factory);
+                @Override
+                public Object getMultiTermComponent() {
+                    return this;
+                }
+            };
         }
+        return new TokenFilterFactory() {
+            @Override
+            public String name() {
+                return getName();
+            }

-        return factory;
+            @Override
+            public TokenStream create(TokenStream tokenStream) {
+                return create.apply(tokenStream, version);
+            }
+        };
     }
 }
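For comparison with the tokenizer variant added next, this is how a plugin typically builds the filter flavors. The signatures are those of the existing class (the `singleton` overload is unchanged by this patch); the filter names here are illustrative:

```java
import java.util.Arrays;
import java.util.List;

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;

class ExampleTokenFilters {
    static List<PreConfiguredTokenFilter> examples() {
        return Arrays.asList(
                // One shared instance for every index; usable in multi-term queries.
                PreConfiguredTokenFilter.singleton("example_lowercase", true, LowerCaseFilter::new),
                // Built once per Elasticsearch version; the BiFunction receives the
                // version the index was created on.
                PreConfiguredTokenFilter.elasticsearchVersion("example_reverse", false,
                        (tokenStream, esVersion) -> new ReverseStringFilter(tokenStream)));
    }
}
```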
diff --git a/core/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenizer.java b/core/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenizer.java
new file mode 100644
index 00000000000..6d1842c7a36
--- /dev/null
+++ b/core/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenizer.java
@@ -0,0 +1,128 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.elasticsearch.Version;
+import org.elasticsearch.common.Nullable;
+import org.elasticsearch.indices.analysis.PreBuiltCacheFactory;
+import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
+
+import java.util.function.Function;
+import java.util.function.Supplier;
+
+/**
+ * Provides pre-configured, shared {@link Tokenizer}s.
+ */
+public final class PreConfiguredTokenizer extends PreConfiguredAnalysisComponent<TokenizerFactory> {
+    /**
+     * Create a pre-configured tokenizer that may not vary at all.
+     *
+     * @param name the name of the tokenizer in the api
+     * @param create builds the tokenizer
+     * @param multiTermComponent null if this tokenizer shouldn't be used for multi-term queries, otherwise a supplier for the
+     *        {@link TokenFilterFactory} that stands in for this tokenizer in multi-term queries.
+     */
+    public static PreConfiguredTokenizer singleton(String name, Supplier<Tokenizer> create,
+            @Nullable Supplier<TokenFilterFactory> multiTermComponent) {
+        return new PreConfiguredTokenizer(name, CachingStrategy.ONE, version -> create.get(),
+                multiTermComponent == null ? null : version -> multiTermComponent.get());
+    }
+
+    /**
+     * Create a pre-configured tokenizer that may vary based on the Lucene version.
+     *
+     * @param name the name of the tokenizer in the api
+     * @param create builds the tokenizer
+     * @param multiTermComponent null if this tokenizer shouldn't be used for multi-term queries, otherwise a supplier for the
+     *        {@link TokenFilterFactory} that stands in for this tokenizer in multi-term queries.
+     */
+    public static PreConfiguredTokenizer luceneVersion(String name, Function<org.apache.lucene.util.Version, Tokenizer> create,
+            @Nullable Function<org.apache.lucene.util.Version, TokenFilterFactory> multiTermComponent) {
+        return new PreConfiguredTokenizer(name, CachingStrategy.LUCENE, version -> create.apply(version.luceneVersion),
+                multiTermComponent == null ? null : version -> multiTermComponent.apply(version.luceneVersion));
+    }
+
+    /**
+     * Create a pre-configured tokenizer that may vary based on the Elasticsearch version.
+     *
+     * @param name the name of the tokenizer in the api
+     * @param create builds the tokenizer
+     * @param multiTermComponent null if this tokenizer shouldn't be used for multi-term queries, otherwise a supplier for the
+     *        {@link TokenFilterFactory} that stands in for this tokenizer in multi-term queries.
+     */
+    public static PreConfiguredTokenizer elasticsearchVersion(String name, Function<org.elasticsearch.Version, Tokenizer> create,
+            @Nullable Function<org.elasticsearch.Version, TokenFilterFactory> multiTermComponent) {
+        return new PreConfiguredTokenizer(name, CachingStrategy.ELASTICSEARCH, create, multiTermComponent);
+    }
+
+    private final Function<Version, Tokenizer> create;
+    private final Function<Version, TokenFilterFactory> multiTermComponent;
+
+    private PreConfiguredTokenizer(String name, PreBuiltCacheFactory.CachingStrategy cache, Function<Version, Tokenizer> create,
+            @Nullable Function<Version, TokenFilterFactory> multiTermComponent) {
+        super(name, cache);
+        this.create = create;
+        this.multiTermComponent = multiTermComponent;
+    }
+
+    /**
+     * Does this tokenizer have an equivalent component for analyzing multi-term queries?
+     */
+    public boolean hasMultiTermComponent() {
+        return multiTermComponent != null;
+    }
+
+    private interface MultiTermAwareTokenizerFactory extends TokenizerFactory, MultiTermAwareComponent {}
+
+    @Override
+    protected TokenizerFactory create(Version version) {
+        if (multiTermComponent != null) {
+            return new MultiTermAwareTokenizerFactory() {
+                @Override
+                public String name() {
+                    return getName();
+                }
+
+                @Override
+                public Tokenizer create() {
+                    return create.apply(version);
+                }
+
+                @Override
+                public Object getMultiTermComponent() {
+                    return multiTermComponent.apply(version);
+                }
+            };
+        } else {
+            return new TokenizerFactory() {
+                @Override
+                public String name() {
+                    return getName();
+                }
+
+                @Override
+                public Tokenizer create() {
+                    return create.apply(version);
+                }
+            };
+        }
+    }
+}
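Taken together, the three factory methods mirror `PreConfiguredTokenFilter`. A hedged sketch of each flavor; the tokenizer names here are hypothetical, while `CommonAnalysisPlugin` further down shows the real "lowercase" registration:

```java
import java.util.Arrays;
import java.util.List;

import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.elasticsearch.index.analysis.PreConfiguredTokenizer;

class ExampleTokenizers {
    static List<PreConfiguredTokenizer> examples() {
        return Arrays.asList(
                // One shared instance; no multi-term stand-in.
                PreConfiguredTokenizer.singleton("example_keyword", KeywordTokenizer::new, null),
                // Rebuilt whenever the Lucene version changes.
                PreConfiguredTokenizer.luceneVersion("example_lucene",
                        luceneVersion -> new WhitespaceTokenizer(), null),
                // Rebuilt whenever the Elasticsearch version changes.
                PreConfiguredTokenizer.elasticsearchVersion("example_es",
                        esVersion -> new WhitespaceTokenizer(), null));
    }
}
```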
diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
index d49edb33eb3..4dd146599c9 100644
--- a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
+++ b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
@@ -104,6 +104,7 @@ import org.elasticsearch.index.analysis.PersianNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.PorterStemTokenFilterFactory;
 import org.elasticsearch.index.analysis.PortugueseAnalyzerProvider;
 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
+import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
 import org.elasticsearch.index.analysis.ReverseTokenFilterFactory;
 import org.elasticsearch.index.analysis.RomanianAnalyzerProvider;
 import org.elasticsearch.index.analysis.RussianAnalyzerProvider;
@@ -141,7 +142,6 @@ import org.elasticsearch.index.analysis.WhitespaceAnalyzerProvider;
 import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
 import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
 import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
-import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
 import org.elasticsearch.plugins.AnalysisPlugin;

 import java.io.IOException;
@@ -178,9 +178,10 @@ public final class AnalysisModule {
         NamedRegistry<AnalysisProvider<AnalyzerProvider<?>>> normalizers = setupNormalizers(plugins);

         Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters = setupPreConfiguredTokenFilters(plugins);
+        Map<String, PreConfiguredTokenizer> preConfiguredTokenizers = setupPreConfiguredTokenizers(plugins);

         analysisRegistry = new AnalysisRegistry(environment, charFilters.getRegistry(), tokenFilters.getRegistry(), tokenizers
-            .getRegistry(), analyzers.getRegistry(), normalizers.getRegistry(), preConfiguredTokenFilters);
+            .getRegistry(), analyzers.getRegistry(), normalizers.getRegistry(), preConfiguredTokenFilters, preConfiguredTokenizers);
     }

     HunspellService getHunspellService() {
@@ -287,6 +288,37 @@ public final class AnalysisModule {
         return unmodifiableMap(preConfiguredTokenFilters.getRegistry());
     }

+    static Map<String, PreConfiguredTokenizer> setupPreConfiguredTokenizers(List<AnalysisPlugin> plugins) {
+        NamedRegistry<PreConfiguredTokenizer> preConfiguredTokenizers = new NamedRegistry<>("pre-configured tokenizer");
+
+        // Temporary shim to register old style pre-configured tokenizers
+        for (PreBuiltTokenizers tokenizer : PreBuiltTokenizers.values()) {
+            String name = tokenizer.name().toLowerCase(Locale.ROOT);
+            PreConfiguredTokenizer preConfigured;
+            switch (tokenizer.getCachingStrategy()) {
+            case ONE:
+                preConfigured = PreConfiguredTokenizer.singleton(name,
+                        () -> tokenizer.create(Version.CURRENT), null);
+                break;
+            default:
+                throw new UnsupportedOperationException(
+                        "Caching strategy unsupported by temporary shim [" + tokenizer + "]");
+            }
+            preConfiguredTokenizers.register(name, preConfigured);
+        }
+        // Temporary shim for aliases. TODO deprecate after they are moved
+        preConfiguredTokenizers.register("nGram", preConfiguredTokenizers.getRegistry().get("ngram"));
+        preConfiguredTokenizers.register("edgeNGram", preConfiguredTokenizers.getRegistry().get("edge_ngram"));
+        preConfiguredTokenizers.register("PathHierarchy", preConfiguredTokenizers.getRegistry().get("path_hierarchy"));
+
+        for (AnalysisPlugin plugin : plugins) {
+            for (PreConfiguredTokenizer tokenizer : plugin.getPreConfiguredTokenizers()) {
+                preConfiguredTokenizers.register(tokenizer.getName(), tokenizer);
+            }
+        }
+        return unmodifiableMap(preConfiguredTokenizers.getRegistry());
+    }
+
     private NamedRegistry<AnalysisProvider<TokenizerFactory>> setupTokenizers(List<AnalysisPlugin> plugins) {
         NamedRegistry<AnalysisProvider<TokenizerFactory>> tokenizers = new NamedRegistry<>("tokenizer");
         tokenizers.register("standard", StandardTokenizerFactory::new);
diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenizers.java b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenizers.java
index a9869b56bc3..52e7ff6c9c4 100644
--- a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenizers.java
+++ b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenizers.java
@@ -21,7 +21,6 @@ package org.elasticsearch.indices.analysis;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.LetterTokenizer;
-import org.apache.lucene.analysis.core.LowerCaseTokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
 import org.apache.lucene.analysis.ngram.NGramTokenizer;
@@ -33,6 +32,7 @@ import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
 import org.apache.lucene.analysis.th.ThaiTokenizer;
 import org.elasticsearch.Version;
 import org.elasticsearch.common.regex.Regex;
+import org.elasticsearch.index.analysis.CustomNormalizerProvider;
 import org.elasticsearch.index.analysis.MultiTermAwareComponent;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.index.analysis.TokenizerFactory;
@@ -42,21 +42,21 @@ import java.util.Locale;

 public enum PreBuiltTokenizers {

-    STANDARD(CachingStrategy.LUCENE) {
+    STANDARD(CachingStrategy.ONE) {
         @Override
         protected Tokenizer create(Version version) {
             return new StandardTokenizer();
         }
     },

-    CLASSIC(CachingStrategy.LUCENE) {
+    CLASSIC(CachingStrategy.ONE) {
         @Override
         protected Tokenizer create(Version version) {
             return new ClassicTokenizer();
         }
     },

-    UAX_URL_EMAIL(CachingStrategy.LUCENE) {
+    UAX_URL_EMAIL(CachingStrategy.ONE) {
         @Override
         protected Tokenizer create(Version version) {
             return new UAX29URLEmailTokenizer();
@@ -77,39 +77,28 @@ public enum PreBuiltTokenizers {
         }
     },

-    LETTER(CachingStrategy.LUCENE) {
+    LETTER(CachingStrategy.ONE) {
         @Override
         protected Tokenizer create(Version version) {
             return new LetterTokenizer();
         }
     },

-    LOWERCASE(CachingStrategy.LUCENE) {
-        @Override
-        protected Tokenizer create(Version version) {
-            return new LowerCaseTokenizer();
-        }
-        @Override
-        protected TokenFilterFactory getMultiTermComponent(Version version) {
-            return PreBuiltTokenFilters.LOWERCASE.getTokenFilterFactory(version);
-        }
-    },
-
-    WHITESPACE(CachingStrategy.LUCENE) {
+    WHITESPACE(CachingStrategy.ONE) {
         @Override
         protected Tokenizer create(Version version) {
             return new WhitespaceTokenizer();
         }
     },

-    NGRAM(CachingStrategy.LUCENE) {
+    NGRAM(CachingStrategy.ONE) {
         @Override
         protected Tokenizer create(Version version) {
             return new NGramTokenizer();
         }
     },

-    EDGE_NGRAM(CachingStrategy.LUCENE) {
+    EDGE_NGRAM(CachingStrategy.ONE) {
         @Override
         protected Tokenizer create(Version version) {
             return new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
@@ -139,64 +128,60 @@ public enum PreBuiltTokenizers {
     }

     protected final PreBuiltCacheFactory.PreBuiltCache<TokenizerFactory> cache;
-
+    private final CachingStrategy cachingStrategy;

     PreBuiltTokenizers(CachingStrategy cachingStrategy) {
+        this.cachingStrategy = cachingStrategy;
         cache = PreBuiltCacheFactory.getCache(cachingStrategy);
     }

+    public CachingStrategy getCachingStrategy() {
+        return cachingStrategy;
+    }
+
     private interface MultiTermAwareTokenizerFactory extends TokenizerFactory, MultiTermAwareComponent {}

-    public synchronized TokenizerFactory getTokenizerFactory(final Version version) {
-        TokenizerFactory tokenizerFactory = cache.get(version);
-        if (tokenizerFactory == null) {
-            final String finalName = name().toLowerCase(Locale.ROOT);
-            if (getMultiTermComponent(version) != null) {
-                tokenizerFactory = new MultiTermAwareTokenizerFactory() {
-                    @Override
-                    public String name() {
-                        return finalName;
-                    }
-
-                    @Override
-                    public Tokenizer create() {
-                        return PreBuiltTokenizers.this.create(version);
-                    }
-
-                    @Override
-                    public Object getMultiTermComponent() {
-                        return PreBuiltTokenizers.this.getMultiTermComponent(version);
-                    }
-                };
-            } else {
-                tokenizerFactory = new TokenizerFactory() {
-                    @Override
-                    public String name() {
-                        return finalName;
-                    }
-
-                    @Override
-                    public Tokenizer create() {
-                        return PreBuiltTokenizers.this.create(version);
-                    }
-                };
-            }
-            cache.put(version, tokenizerFactory);
-        }
-
-        return tokenizerFactory;
-    }
-
     /**
-     * Get a pre built Tokenizer by its name or fallback to the default one
-     * @param name Tokenizer name
-     * @param defaultTokenizer default Tokenizer if name not found
+     * Old style resolution for {@link TokenizerFactory}. Exists entirely to keep
+     * {@link CustomNormalizerProvider#build(java.util.Map, java.util.Map)} working during the migration.
      */
-    public static PreBuiltTokenizers getOrDefault(String name, PreBuiltTokenizers defaultTokenizer) {
-        try {
-            return valueOf(name.toUpperCase(Locale.ROOT));
-        } catch (IllegalArgumentException e) {
-            return defaultTokenizer;
+    public synchronized TokenizerFactory getTokenizerFactory(final Version version) {
+        TokenizerFactory tokenizerFactory = cache.get(version);
+        if (tokenizerFactory == null) {
+            final String finalName = name().toLowerCase(Locale.ROOT);
+            if (getMultiTermComponent(version) != null) {
+                tokenizerFactory = new MultiTermAwareTokenizerFactory() {
+                    @Override
+                    public String name() {
+                        return finalName;
+                    }
+
+                    @Override
+                    public Tokenizer create() {
+                        return PreBuiltTokenizers.this.create(version);
+                    }
+
+                    @Override
+                    public Object getMultiTermComponent() {
+                        return PreBuiltTokenizers.this.getMultiTermComponent(version);
+                    }
+                };
+            } else {
+                tokenizerFactory = new TokenizerFactory() {
+                    @Override
+                    public String name() {
+                        return finalName;
+                    }
+
+                    @Override
+                    public Tokenizer create() {
+                        return PreBuiltTokenizers.this.create(version);
+                    }
+                };
+            }
+            cache.put(version, tokenizerFactory);
         }
-    }
+
+        return tokenizerFactory;
+    }
 }
diff --git a/core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java b/core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java
index c248c706f23..99b4117f112 100644
--- a/core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java
+++ b/core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java
@@ -22,24 +22,21 @@ package org.elasticsearch.plugins;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharFilter;
 import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
-import org.elasticsearch.Version;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.analysis.AnalyzerProvider;
 import org.elasticsearch.index.analysis.CharFilterFactory;
+import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.index.analysis.TokenizerFactory;
 import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
-import org.elasticsearch.indices.analysis.PreBuiltCacheFactory;

 import java.io.IOException;
 import java.util.List;
 import java.util.Map;
-import java.util.function.BiFunction;

 import static java.util.Collections.emptyList;
 import static java.util.Collections.emptyMap;
@@ -95,12 +92,19 @@ public interface AnalysisPlugin {
     }

     /**
-     * Override to add additional pre-configured token filters.
+     * Override to add additional pre-configured {@link TokenFilter}s.
     */
     default List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
         return emptyList();
     }

+    /**
+     * Override to add additional pre-configured {@link Tokenizer}s.
+     */
+    default List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
+        return emptyList();
+    }
+
     /**
      * Override to add additional hunspell {@link org.apache.lucene.analysis.hunspell.Dictionary}s.
      */
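The new extension point is consumed the same way as `getPreConfiguredTokenFilters`. A minimal hypothetical plugin registering one tokenizer under the name "example":

```java
import java.util.List;

import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

import static java.util.Collections.singletonList;

public class ExampleAnalysisPlugin extends Plugin implements AnalysisPlugin {
    @Override
    public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
        // Registered once at node startup; referenced from index settings as
        // index.analysis.analyzer.<name>.tokenizer: example
        return singletonList(
                PreConfiguredTokenizer.singleton("example", WhitespaceTokenizer::new, null));
    }
}
```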
diff --git a/core/src/test/java/org/elasticsearch/index/IndexModuleTests.java b/core/src/test/java/org/elasticsearch/index/IndexModuleTests.java
index b1a9bc75019..67f3a17c510 100644
--- a/core/src/test/java/org/elasticsearch/index/IndexModuleTests.java
+++ b/core/src/test/java/org/elasticsearch/index/IndexModuleTests.java
@@ -123,7 +123,8 @@ public class IndexModuleTests extends ESTestCase {
         indexSettings = IndexSettingsModule.newIndexSettings("foo", settings);
         index = indexSettings.getIndex();
         environment = new Environment(settings);
-        emptyAnalysisRegistry = new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap());
+        emptyAnalysisRegistry = new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(),
+                emptyMap(), emptyMap());
         threadPool = new TestThreadPool("test");
         circuitBreakerService = new NoneCircuitBreakerService();
         bigArrays = new BigArrays(settings, circuitBreakerService);
diff --git a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java
index 03329667627..57ef842072a 100644
--- a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java
+++ b/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java
@@ -41,7 +41,6 @@ import org.elasticsearch.test.VersionUtils;

 import java.io.IOException;
 import java.util.Map;
-import java.util.concurrent.atomic.AtomicBoolean;

 import static java.util.Collections.emptyMap;
 import static java.util.Collections.singletonList;
@@ -50,25 +49,29 @@ import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.instanceOf;

 public class AnalysisRegistryTests extends ESTestCase {
-
-    private Environment emptyEnvironment;
     private AnalysisRegistry emptyRegistry;
-    private IndexSettings emptyIndexSettingsOfCurrentVersion;

     private static AnalyzerProvider<?> analyzerProvider(final String name) {
         return new PreBuiltAnalyzerProvider(name, AnalyzerScope.INDEX, new EnglishAnalyzer());
     }

+    private static AnalysisRegistry emptyAnalysisRegistry(Settings settings) {
+        return new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(),
+                emptyMap());
+    }
+
+    private static IndexSettings indexSettingsOfCurrentVersion(Settings.Builder settings) {
+        return IndexSettingsModule.newIndexSettings("index", settings
+                .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
+                .build());
+    }
+
     @Override
     public void setUp() throws Exception {
         super.setUp();
-        emptyEnvironment = new Environment(Settings.builder()
+        emptyRegistry = emptyAnalysisRegistry(Settings.builder()
                 .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
                 .build());
-        emptyRegistry = new AnalysisRegistry(emptyEnvironment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap());
-        emptyIndexSettingsOfCurrentVersion = IndexSettingsModule.newIndexSettings("index", Settings.builder()
-            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
-            .build());
     }

     public void testDefaultAnalyzers() throws IOException {
@@ -191,12 +194,8 @@ public class AnalysisRegistryTests extends ESTestCase {
         Settings indexSettings = Settings.builder()
                 .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build();
         IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
-        IndexAnalyzers indexAnalyzers =
-            new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())
-                .build(idxSettings);
-        IndexAnalyzers otherIndexAnalyzers =
-            new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())
-                .build(idxSettings);
+        IndexAnalyzers indexAnalyzers = emptyAnalysisRegistry(settings).build(idxSettings);
+        IndexAnalyzers otherIndexAnalyzers = emptyAnalysisRegistry(settings).build(idxSettings);
         final int numIters = randomIntBetween(5, 20);
         for (int i = 0; i < numIters; i++) {
             PreBuiltAnalyzers preBuiltAnalyzers = RandomPicks.randomFrom(random(), PreBuiltAnalyzers.values());
@@ -204,22 +203,6 @@ public class AnalysisRegistryTests extends ESTestCase {
         }
     }

-    public void testPreConfiguredTokenFiltersAreCached() throws IOException {
-        AtomicBoolean built = new AtomicBoolean(false);
-        PreConfiguredTokenFilter assertsBuiltOnce = PreConfiguredTokenFilter.singleton("asserts_built_once", false, tokenStream -> {
-            if (false == built.compareAndSet(false, true)) {
-                fail("Attempted to build the token filter twice when it should have been cached");
-            }
-            return new MockTokenFilter(tokenStream, MockTokenFilter.EMPTY_STOPSET);
-        });
-        try (AnalysisRegistry registryWithPreBuiltTokenFilter = new AnalysisRegistry(emptyEnvironment, emptyMap(), emptyMap(), emptyMap(),
-                emptyMap(), emptyMap(), singletonMap("asserts_built_once", assertsBuiltOnce))) {
-            IndexAnalyzers indexAnalyzers = registryWithPreBuiltTokenFilter.build(emptyIndexSettingsOfCurrentVersion);
-            IndexAnalyzers otherIndexAnalyzers = registryWithPreBuiltTokenFilter.build(emptyIndexSettingsOfCurrentVersion);
-            assertSame(indexAnalyzers.get("asserts_built_once"), otherIndexAnalyzers.get("asserts_built_once"));
-        }
-    }
-
     public void testNoTypeOrTokenizerErrorMessage() throws IOException {
         Version version = VersionUtils.randomVersion(random());
         Settings settings = Settings
@@ -231,14 +214,12 @@ public class AnalysisRegistryTests extends ESTestCase {
             .build();
         IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);

-        IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () ->
-            new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())
-                .build(idxSettings));
+        IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> emptyAnalysisRegistry(settings).build(idxSettings));
         assertThat(e.getMessage(), equalTo("analyzer [test_analyzer] must specify either an analyzer type, or a tokenizer"));
     }

     public void testCloseIndexAnalyzersMultipleTimes() throws IOException {
-        IndexAnalyzers indexAnalyzers = emptyRegistry.build(emptyIndexSettingsOfCurrentVersion);
+        IndexAnalyzers indexAnalyzers = emptyRegistry.build(indexSettingsOfCurrentVersion(Settings.builder()));
         indexAnalyzers.close();
         indexAnalyzers.close();
     }
diff --git a/core/src/test/java/org/elasticsearch/indices/analysis/AnalysisModuleTests.java b/core/src/test/java/org/elasticsearch/indices/analysis/AnalysisModuleTests.java
index 298c8938dd2..f94c0c8fe74 100644
--- a/core/src/test/java/org/elasticsearch/indices/analysis/AnalysisModuleTests.java
+++ b/core/src/test/java/org/elasticsearch/indices/analysis/AnalysisModuleTests.java
@@ -23,7 +23,6 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.hunspell.Dictionary;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@@ -37,12 +36,12 @@ import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.analysis.Analysis;
 import org.elasticsearch.index.analysis.AnalysisRegistry;
-import org.elasticsearch.index.analysis.AnalysisTestsHelper;
 import org.elasticsearch.index.analysis.CharFilterFactory;
 import org.elasticsearch.index.analysis.CustomAnalyzer;
 import org.elasticsearch.index.analysis.IndexAnalyzers;
 import org.elasticsearch.index.analysis.NamedAnalyzer;
 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
+import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
 import org.elasticsearch.index.analysis.StandardTokenizerFactory;
 import org.elasticsearch.index.analysis.StopTokenFilterFactory;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
@@ -57,7 +56,6 @@ import org.hamcrest.MatcherAssert;
 import java.io.BufferedWriter;
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.StringReader;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -164,18 +162,6 @@ public class AnalysisModuleTests extends ESTestCase {
         assertEquals(org.apache.lucene.util.Version.fromBits(3,6,0), indexAnalyzers.get("custom7").analyzer().getVersion());
     }

-    private void assertTokenFilter(String name, Class<?> clazz) throws IOException {
-        Settings settings = Settings.builder()
-            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
-            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
-        TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
-        TokenFilterFactory tokenFilter = analysis.tokenFilter.get(name);
-        Tokenizer tokenizer = new WhitespaceTokenizer();
-        tokenizer.setReader(new StringReader("foo bar"));
-        TokenStream stream = tokenFilter.create(tokenizer);
-        assertThat(stream, instanceOf(clazz));
-    }
-
     private void testSimpleConfiguration(Settings settings) throws IOException {
         IndexAnalyzers indexAnalyzers = getIndexAnalyzers(settings);
         Analyzer analyzer = indexAnalyzers.get("custom1").analyzer();
@@ -269,27 +255,6 @@ public class AnalysisModuleTests extends ESTestCase {
      * and that do not vary based on version at all.
      */
     public void testPluginPreConfiguredTokenFilters() throws IOException {
-        // Simple token filter that appends text to the term
-        final class AppendTokenFilter extends TokenFilter {
-            private final CharTermAttribute term = addAttribute(CharTermAttribute.class);
-            private final char[] appendMe;
-
-            protected AppendTokenFilter(TokenStream input, String appendMe) {
-                super(input);
-                this.appendMe = appendMe.toCharArray();
-            }
-
-            @Override
-            public boolean incrementToken() throws IOException {
-                if (false == input.incrementToken()) {
-                    return false;
-                }
-                term.resizeBuffer(term.length() + appendMe.length);
-                System.arraycopy(appendMe, 0, term.buffer(), term.length(), appendMe.length);
-                term.setLength(term.length() + appendMe.length);
-                return true;
-            }
-        }
         boolean noVersionSupportsMultiTerm = randomBoolean();
         boolean luceneVersionSupportsMultiTerm = randomBoolean();
         boolean elasticsearchVersionSupportsMultiTerm = randomBoolean();
@@ -329,6 +294,82 @@ public class AnalysisModuleTests extends ESTestCase {
             analyzers.get("elasticsearch_version").normalize("", "test").utf8ToString());
     }

+    /**
+     * Tests that plugins can register pre-configured tokenizers that vary in behavior based on Elasticsearch version, Lucene version,
+     * and that do not vary based on version at all.
+     */
+    public void testPluginPreConfiguredTokenizers() throws IOException {
+        boolean noVersionSupportsMultiTerm = randomBoolean();
+        boolean luceneVersionSupportsMultiTerm = randomBoolean();
+        boolean elasticsearchVersionSupportsMultiTerm = randomBoolean();
+
+        // Simple tokenizer that always spits out a single token with some preconfigured characters
+        final class FixedTokenizer extends Tokenizer {
+            private final CharTermAttribute term = addAttribute(CharTermAttribute.class);
+            private final char[] chars;
+            private boolean read = false;
+
+            protected FixedTokenizer(String chars) {
+                this.chars = chars.toCharArray();
+            }
+
+            @Override
+            public boolean incrementToken() throws IOException {
+                if (read) {
+                    return false;
+                }
+                clearAttributes();
+                read = true;
+                term.resizeBuffer(chars.length);
+                System.arraycopy(chars, 0, term.buffer(), 0, chars.length);
+                term.setLength(chars.length);
+                return true;
+            }
+
+            @Override
+            public void reset() throws IOException {
+                super.reset();
+                read = false;
+            }
+        }
+        AnalysisRegistry registry = new AnalysisModule(new Environment(emptyNodeSettings), singletonList(new AnalysisPlugin() {
+            @Override
+            public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
+                return Arrays.asList(
+                        PreConfiguredTokenizer.singleton("no_version", () -> new FixedTokenizer("no_version"),
+                                noVersionSupportsMultiTerm ? () -> AppendTokenFilter.factoryForSuffix("no_version") : null),
+                        PreConfiguredTokenizer.luceneVersion("lucene_version",
+                                luceneVersion -> new FixedTokenizer(luceneVersion.toString()),
+                                luceneVersionSupportsMultiTerm ?
+                                        luceneVersion -> AppendTokenFilter.factoryForSuffix(luceneVersion.toString()) : null),
+                        PreConfiguredTokenizer.elasticsearchVersion("elasticsearch_version",
+                                esVersion -> new FixedTokenizer(esVersion.toString()),
+                                elasticsearchVersionSupportsMultiTerm ?
+                                        esVersion -> AppendTokenFilter.factoryForSuffix(esVersion.toString()) : null)
+                );
+            }
+        })).getAnalysisRegistry();
+
+        Version version = VersionUtils.randomVersion(random());
+        IndexAnalyzers analyzers = getIndexAnalyzers(registry, Settings.builder()
+                .put("index.analysis.analyzer.no_version.tokenizer", "no_version")
+                .put("index.analysis.analyzer.lucene_version.tokenizer", "lucene_version")
+                .put("index.analysis.analyzer.elasticsearch_version.tokenizer", "elasticsearch_version")
+                .put(IndexMetaData.SETTING_VERSION_CREATED, version)
+                .build());
+        assertTokenStreamContents(analyzers.get("no_version").tokenStream("", "test"), new String[] {"no_version"});
+        assertTokenStreamContents(analyzers.get("lucene_version").tokenStream("", "test"), new String[] {version.luceneVersion.toString()});
+        assertTokenStreamContents(analyzers.get("elasticsearch_version").tokenStream("", "test"), new String[] {version.toString()});
+
+        // These are currently broken by https://github.com/elastic/elasticsearch/issues/24752
+//        assertEquals("test" + (noVersionSupportsMultiTerm ? "no_version" : ""),
+//                analyzers.get("no_version").normalize("", "test").utf8ToString());
+//        assertEquals("test" + (luceneVersionSupportsMultiTerm ? version.luceneVersion.toString() : ""),
+//                analyzers.get("lucene_version").normalize("", "test").utf8ToString());
+//        assertEquals("test" + (elasticsearchVersionSupportsMultiTerm ? version.toString() : ""),
+//                analyzers.get("elasticsearch_version").normalize("", "test").utf8ToString());
+    }
+
     public void testRegisterHunspellDictionary() throws Exception {
         Settings settings = Settings.builder()
                 .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
@@ -349,4 +390,41 @@ public class AnalysisModuleTests extends ESTestCase {
         }));
         assertSame(dictionary, module.getHunspellService().getDictionary("foo"));
     }
+
+    // Simple token filter that appends text to the term
+    private static class AppendTokenFilter extends TokenFilter {
+        public static TokenFilterFactory factoryForSuffix(String suffix) {
+            return new TokenFilterFactory() {
+                @Override
+                public String name() {
+                    return suffix;
+                }
+
+                @Override
+                public TokenStream create(TokenStream tokenStream) {
+                    return new AppendTokenFilter(tokenStream, suffix);
+                }
+            };
+        }
+
+        private final CharTermAttribute term = addAttribute(CharTermAttribute.class);
+        private final char[] appendMe;
+
+        protected AppendTokenFilter(TokenStream input, String appendMe) {
+            super(input);
+            this.appendMe = appendMe.toCharArray();
+        }
+
+        @Override
+        public boolean incrementToken() throws IOException {
+            if (false == input.incrementToken()) {
+                return false;
+            }
+            term.resizeBuffer(term.length() + appendMe.length);
+            System.arraycopy(appendMe, 0, term.buffer(), term.length(), appendMe.length);
+            term.setLength(term.length() + appendMe.length);
+            return true;
+        }
+    }
 }
diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
index 290b09edc1d..fcca4f7eddf 100644
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
@@ -20,7 +20,9 @@
 package org.elasticsearch.analysis.common;

 import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
 import org.apache.lucene.analysis.ar.ArabicStemFilter;
 import org.apache.lucene.analysis.br.BrazilianStemFilter;
@@ -29,6 +31,7 @@ import org.apache.lucene.analysis.cjk.CJKWidthFilter;
 import org.apache.lucene.analysis.ckb.SoraniNormalizationFilter;
 import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
 import org.apache.lucene.analysis.core.DecimalDigitFilter;
+import org.apache.lucene.analysis.core.LowerCaseTokenizer;
 import org.apache.lucene.analysis.core.StopAnalyzer;
 import org.apache.lucene.analysis.core.UpperCaseFilter;
 import org.apache.lucene.analysis.cz.CzechStemFilter;
@@ -66,6 +69,7 @@ import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory;
 import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory;
 import org.elasticsearch.index.analysis.LimitTokenCountFilterFactory;
 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
+import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
 import org.elasticsearch.plugins.AnalysisPlugin;
@@ -174,4 +178,21 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
                 | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null)));
         return filters;
     }
+
+    @Override
+    public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
+        List<PreConfiguredTokenizer> tokenizers = new ArrayList<>();
+        tokenizers.add(PreConfiguredTokenizer.singleton("lowercase", LowerCaseTokenizer::new, () -> new TokenFilterFactory() {
+            @Override
+            public String name() {
+                return "lowercase";
+            }
+
+            @Override
+            public TokenStream create(TokenStream tokenStream) {
+                return new LowerCaseFilter(tokenStream);
+            }
+        }));
+        return tokenizers;
+    }
 }
diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java
index d2505406457..3ce7fd1d301 100644
--- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java
@@ -117,6 +117,13 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
         return filters;
     }

+    @Override
+    protected Map<String, Class<?>> getPreConfiguredTokenizers() {
+        Map<String, Class<?>> tokenizers = new TreeMap<>(super.getPreConfiguredTokenizers());
+
+        return tokenizers;
+    }
+
     /**
      * Fails if a tokenizer is marked in the superclass with {@link MovedToAnalysisCommon} but
      * hasn't been marked in this class with its proper factory.
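A note on the "lowercase" registration above: a tokenizer cannot run inside a normalizer or a multi-term (wildcard/prefix) query rewrite, which apply token filters only, so an equivalent `LowerCaseFilter` is supplied as its stand-in. The resolution path below is illustrative only; consumers reach the stand-in through `MultiTermAwareComponent`:

```java
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;

class MultiTermResolution {
    // Sketch: given the factory built for "lowercase" above, multi-term analysis
    // swaps in the equivalent token filter instead of running the tokenizer.
    static TokenFilterFactory standInFor(TokenizerFactory factory) {
        if (factory instanceof MultiTermAwareComponent) {
            return (TokenFilterFactory) ((MultiTermAwareComponent) factory).getMultiTermComponent();
        }
        return null; // no multi-term equivalent declared
    }
}
```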
diff --git a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
index 040d2fb2dc6..f70a3a5ef99 100644
--- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
+++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
@@ -66,6 +66,7 @@ import org.elasticsearch.index.analysis.PatternTokenizerFactory;
 import org.elasticsearch.index.analysis.PersianNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.PorterStemTokenFilterFactory;
 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
+import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
 import org.elasticsearch.index.analysis.ReverseTokenFilterFactory;
 import org.elasticsearch.index.analysis.ScandinavianFoldingFilterFactory;
 import org.elasticsearch.index.analysis.ScandinavianNormalizationFilterFactory;
@@ -95,6 +96,7 @@ import java.util.Collection;
 import java.util.EnumMap;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.Locale;
 import java.util.Map;
 import java.util.Objects;
 import java.util.Set;
@@ -103,6 +105,7 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;

 import static java.util.Collections.singletonList;
+import static org.hamcrest.Matchers.typeCompatibleWith;

 /**
  * Alerts us if new analysis components are added to Lucene, so we don't miss them.
@@ -148,26 +151,6 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
         .put("simplepatternsplit", Void.class)
         .immutableMap();

-    static final Map<PreBuiltTokenizers, Class<?>> PREBUILT_TOKENIZERS;
-    static {
-        PREBUILT_TOKENIZERS = new EnumMap<>(PreBuiltTokenizers.class);
-        for (PreBuiltTokenizers tokenizer : PreBuiltTokenizers.values()) {
-            Class<?> luceneFactoryClazz;
-            switch (tokenizer) {
-            case UAX_URL_EMAIL:
-                luceneFactoryClazz = org.apache.lucene.analysis.standard.UAX29URLEmailTokenizerFactory.class;
-                break;
-            case PATH_HIERARCHY:
-                luceneFactoryClazz = Void.class;
-                break;
-            default:
-                luceneFactoryClazz = org.apache.lucene.analysis.util.TokenizerFactory.lookupClass(
-                        toCamelCase(tokenizer.getTokenizerFactory(Version.CURRENT).name()));
-            }
-            PREBUILT_TOKENIZERS.put(tokenizer, luceneFactoryClazz);
-        }
-    }
-
     static final Map<String, Class<?>> KNOWN_TOKENFILTERS = new MapBuilder<String, Class<?>>()
         // exposed in ES
         .put("apostrophe", ApostropheFilterFactory.class)
@@ -319,22 +302,26 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
         this.plugin = Objects.requireNonNull(plugin, "plugin is required. use an empty plugin for core");
     }

-    protected Map<String, Class<?>> getTokenizers() {
-        return KNOWN_TOKENIZERS;
+    protected Map<String, Class<?>> getCharFilters() {
+        return KNOWN_CHARFILTERS;
     }

     protected Map<String, Class<?>> getTokenFilters() {
         return KNOWN_TOKENFILTERS;
     }

+    protected Map<String, Class<?>> getTokenizers() {
+        return KNOWN_TOKENIZERS;
+    }
+
     /**
      * Map containing pre-configured token filters that should be available
      * after installing this plugin. The map is from the name of the token
      * filter to the class of the Lucene {@link TokenFilterFactory} that it
-     * is emulating. If the Lucene filter factory is {@code null} then the
-     * test will look it up for you from the name. If there is no Lucene
-     * {@linkplain TokenFilterFactory} then the right hand side should
-     * be {@link Void}.
+     * is emulating. If the Lucene {@linkplain TokenFilterFactory} is
+     * {@code null} then the test will look it up for you from the name. If
+     * there is no Lucene {@linkplain TokenFilterFactory} then the right
+     * hand side should be {@link Void}.
      */
     protected Map<String, Class<?>> getPreConfiguredTokenFilters() {
         Map<String, Class<?>> filters = new HashMap<>();
@@ -343,8 +330,33 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
         return filters;
     }

-    protected Map<String, Class<?>> getCharFilters() {
-        return KNOWN_CHARFILTERS;
+    /**
+     * Map containing pre-configured tokenizers that should be available
+     * after installing this plugin. The map is from the name of the
+     * tokenizer to the class of the Lucene {@link TokenizerFactory} that it
+     * is emulating. If the Lucene {@linkplain TokenizerFactory} is
+     * {@code null} then the test will look it up for you from the name.
+     * If there is no Lucene {@linkplain TokenizerFactory} then the right
+     * hand side should be {@link Void}.
+     */
+    protected Map<String, Class<?>> getPreConfiguredTokenizers() {
+        Map<String, Class<?>> tokenizers = new HashMap<>();
+        // TODO drop this temporary shim when all the old style tokenizers have been migrated to new style
+        for (PreBuiltTokenizers tokenizer : PreBuiltTokenizers.values()) {
+            final Class<?> luceneFactoryClazz;
+            switch (tokenizer) {
+            case UAX_URL_EMAIL:
+                luceneFactoryClazz = org.apache.lucene.analysis.standard.UAX29URLEmailTokenizerFactory.class;
+                break;
+            case PATH_HIERARCHY:
+                luceneFactoryClazz = Void.class;
+                break;
+            default:
+                luceneFactoryClazz = null;
+            }
+            tokenizers.put(tokenizer.name().toLowerCase(Locale.ROOT), luceneFactoryClazz);
+        }
+        return tokenizers;
     }

     public void testTokenizers() {
@@ -421,21 +433,8 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {

         Collection<Object> expected = new HashSet<>();
         Collection<Object> actual = new HashSet<>();
-        for (Map.Entry<PreBuiltTokenizers, Class<?>> entry : PREBUILT_TOKENIZERS.entrySet()) {
-            PreBuiltTokenizers tokenizer = entry.getKey();
-            Class<?> luceneFactory = entry.getValue();
-            if (luceneFactory == Void.class) {
-                continue;
-            }
-            assertTrue(TokenizerFactory.class.isAssignableFrom(luceneFactory));
-            if (tokenizer.getTokenizerFactory(Version.CURRENT) instanceof MultiTermAwareComponent) {
-                actual.add(tokenizer);
-            }
-            if (org.apache.lucene.analysis.util.MultiTermAwareComponent.class.isAssignableFrom(luceneFactory)) {
-                expected.add(tokenizer);
-            }
-        }
-        Map<String, PreConfiguredTokenFilter> preBuiltTokenFilters = AnalysisModule.setupPreConfiguredTokenFilters(singletonList(plugin));
+        Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters =
+                AnalysisModule.setupPreConfiguredTokenFilters(singletonList(plugin));
         for (Map.Entry<String, Class<?>> entry : getPreConfiguredTokenFilters().entrySet()) {
            String name = entry.getKey();
             Class<?> luceneFactory = entry.getValue();
@@ -445,8 +444,8 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
             if (luceneFactory == null) {
                 luceneFactory = TokenFilterFactory.lookupClass(toCamelCase(name));
             }
-            assertTrue(TokenFilterFactory.class.isAssignableFrom(luceneFactory));
-            PreConfiguredTokenFilter filter = preBuiltTokenFilters.get(name);
+            assertThat(luceneFactory, typeCompatibleWith(TokenFilterFactory.class));
+            PreConfiguredTokenFilter filter = preConfiguredTokenFilters.get(name);
             assertNotNull("test claims pre built token filter [" + name + "] should be available but it wasn't", filter);
             if (filter.shouldUseFilterForMultitermQueries()) {
                 actual.add("token filter [" + name + "]");
@@ -455,6 +454,26 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
                 expected.add("token filter [" + name + "]");
             }
         }
+        Map<String, PreConfiguredTokenizer> preConfiguredTokenizers = AnalysisModule.setupPreConfiguredTokenizers(singletonList(plugin));
+        for (Map.Entry<String, Class<?>> entry : getPreConfiguredTokenizers().entrySet()) {
+            String name = entry.getKey();
+            Class<?> luceneFactory = entry.getValue();
+            if (luceneFactory == Void.class) {
+                continue;
+            }
+            if (luceneFactory == null) {
+                luceneFactory = TokenizerFactory.lookupClass(toCamelCase(name));
+            }
+            assertThat(luceneFactory, typeCompatibleWith(TokenizerFactory.class));
+            PreConfiguredTokenizer tokenizer = preConfiguredTokenizers.get(name);
+            assertNotNull("test claims pre-configured tokenizer [" + name + "] should be available but it wasn't", tokenizer);
+            if (tokenizer.hasMultiTermComponent()) {
+                actual.add(tokenizer);
+            }
+            if (org.apache.lucene.analysis.util.MultiTermAwareComponent.class.isAssignableFrom(luceneFactory)) {
+                expected.add(tokenizer);
+            }
+        }
         for (Map.Entry<PreBuiltCharFilters, Class<?>> entry : PREBUILT_CHARFILTERS.entrySet()) {
             PreBuiltCharFilters charFilter = entry.getKey();
             Class<?> luceneFactory = entry.getValue();