Allow plugins to register pre-configured tokenizers (#24751)
Allows plugins to register pre-configured tokenizers. Many of the decisions are the same as those in #24223 and #24572. This only migrates the lowercase tokenizer, but I figure that is a good start because it proves out the features.
This commit is contained in:
parent
ae73670257
commit
b9ea579633
|
@ -74,14 +74,15 @@ public final class AnalysisRegistry implements Closeable {
|
||||||
Map<String, AnalysisProvider<TokenizerFactory>> tokenizers,
|
Map<String, AnalysisProvider<TokenizerFactory>> tokenizers,
|
||||||
Map<String, AnalysisProvider<AnalyzerProvider<?>>> analyzers,
|
Map<String, AnalysisProvider<AnalyzerProvider<?>>> analyzers,
|
||||||
Map<String, AnalysisProvider<AnalyzerProvider<?>>> normalizers,
|
Map<String, AnalysisProvider<AnalyzerProvider<?>>> normalizers,
|
||||||
Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters) {
|
Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters,
|
||||||
|
Map<String, PreConfiguredTokenizer> preConfiguredTokenizers) {
|
||||||
this.environment = environment;
|
this.environment = environment;
|
||||||
this.charFilters = unmodifiableMap(charFilters);
|
this.charFilters = unmodifiableMap(charFilters);
|
||||||
this.tokenFilters = unmodifiableMap(tokenFilters);
|
this.tokenFilters = unmodifiableMap(tokenFilters);
|
||||||
this.tokenizers = unmodifiableMap(tokenizers);
|
this.tokenizers = unmodifiableMap(tokenizers);
|
||||||
this.analyzers = unmodifiableMap(analyzers);
|
this.analyzers = unmodifiableMap(analyzers);
|
||||||
this.normalizers = unmodifiableMap(normalizers);
|
this.normalizers = unmodifiableMap(normalizers);
|
||||||
prebuiltAnalysis = new PrebuiltAnalysis(preConfiguredTokenFilters);
|
prebuiltAnalysis = new PrebuiltAnalysis(preConfiguredTokenFilters, preConfiguredTokenizers);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -169,12 +170,12 @@ public final class AnalysisRegistry implements Closeable {
|
||||||
*/
|
*/
|
||||||
tokenFilters.put("synonym", requiresAnalysisSettings((is, env, name, settings) -> new SynonymTokenFilterFactory(is, env, this, name, settings)));
|
tokenFilters.put("synonym", requiresAnalysisSettings((is, env, name, settings) -> new SynonymTokenFilterFactory(is, env, this, name, settings)));
|
||||||
tokenFilters.put("synonym_graph", requiresAnalysisSettings((is, env, name, settings) -> new SynonymGraphTokenFilterFactory(is, env, this, name, settings)));
|
tokenFilters.put("synonym_graph", requiresAnalysisSettings((is, env, name, settings) -> new SynonymGraphTokenFilterFactory(is, env, this, name, settings)));
|
||||||
return buildMapping(Component.FILTER, indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.tokenFilterFactories);
|
return buildMapping(Component.FILTER, indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.preConfiguredTokenFilters);
|
||||||
}
|
}
|
||||||
|
|
||||||
public Map<String, TokenizerFactory> buildTokenizerFactories(IndexSettings indexSettings) throws IOException {
|
public Map<String, TokenizerFactory> buildTokenizerFactories(IndexSettings indexSettings) throws IOException {
|
||||||
final Map<String, Settings> tokenizersSettings = indexSettings.getSettings().getGroups(INDEX_ANALYSIS_TOKENIZER);
|
final Map<String, Settings> tokenizersSettings = indexSettings.getSettings().getGroups(INDEX_ANALYSIS_TOKENIZER);
|
||||||
return buildMapping(Component.TOKENIZER, indexSettings, tokenizersSettings, tokenizers, prebuiltAnalysis.tokenizerFactories);
|
return buildMapping(Component.TOKENIZER, indexSettings, tokenizersSettings, tokenizers, prebuiltAnalysis.preConfiguredTokenizers);
|
||||||
}
|
}
|
||||||
|
|
||||||
public Map<String, CharFilterFactory> buildCharFilterFactories(IndexSettings indexSettings) throws IOException {
|
public Map<String, CharFilterFactory> buildCharFilterFactories(IndexSettings indexSettings) throws IOException {
|
||||||
|
@ -394,31 +395,22 @@ public final class AnalysisRegistry implements Closeable {
|
||||||
private static class PrebuiltAnalysis implements Closeable {
|
private static class PrebuiltAnalysis implements Closeable {
|
||||||
|
|
||||||
final Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<?>>> analyzerProviderFactories;
|
final Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<?>>> analyzerProviderFactories;
|
||||||
final Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> tokenizerFactories;
|
final Map<String, ? extends AnalysisProvider<TokenFilterFactory>> preConfiguredTokenFilters;
|
||||||
final Map<String, ? extends AnalysisProvider<TokenFilterFactory>> tokenFilterFactories;
|
final Map<String, ? extends AnalysisProvider<TokenizerFactory>> preConfiguredTokenizers;
|
||||||
final Map<String, AnalysisModule.AnalysisProvider<CharFilterFactory>> charFilterFactories;
|
final Map<String, AnalysisModule.AnalysisProvider<CharFilterFactory>> charFilterFactories;
|
||||||
|
|
||||||
private PrebuiltAnalysis(Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters) {
|
private PrebuiltAnalysis(
|
||||||
|
Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters,
|
||||||
|
Map<String, PreConfiguredTokenizer> preConfiguredTokenizers) {
|
||||||
Map<String, PreBuiltAnalyzerProviderFactory> analyzerProviderFactories = new HashMap<>();
|
Map<String, PreBuiltAnalyzerProviderFactory> analyzerProviderFactories = new HashMap<>();
|
||||||
Map<String, PreBuiltTokenizerFactoryFactory> tokenizerFactories = new HashMap<>();
|
|
||||||
Map<String, PreBuiltCharFilterFactoryFactory> charFilterFactories = new HashMap<>();
|
Map<String, PreBuiltCharFilterFactoryFactory> charFilterFactories = new HashMap<>();
|
||||||
|
|
||||||
// Analyzers
|
// Analyzers
|
||||||
for (PreBuiltAnalyzers preBuiltAnalyzerEnum : PreBuiltAnalyzers.values()) {
|
for (PreBuiltAnalyzers preBuiltAnalyzerEnum : PreBuiltAnalyzers.values()) {
|
||||||
String name = preBuiltAnalyzerEnum.name().toLowerCase(Locale.ROOT);
|
String name = preBuiltAnalyzerEnum.name().toLowerCase(Locale.ROOT);
|
||||||
analyzerProviderFactories.put(name, new PreBuiltAnalyzerProviderFactory(name, AnalyzerScope.INDICES, preBuiltAnalyzerEnum.getAnalyzer(Version.CURRENT)));
|
analyzerProviderFactories.put(name, new PreBuiltAnalyzerProviderFactory(name, AnalyzerScope.INDICES, preBuiltAnalyzerEnum.getAnalyzer(Version.CURRENT)));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Tokenizers
|
|
||||||
for (PreBuiltTokenizers preBuiltTokenizer : PreBuiltTokenizers.values()) {
|
|
||||||
String name = preBuiltTokenizer.name().toLowerCase(Locale.ROOT);
|
|
||||||
tokenizerFactories.put(name, new PreBuiltTokenizerFactoryFactory(preBuiltTokenizer.getTokenizerFactory(Version.CURRENT)));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Tokenizer aliases
|
|
||||||
tokenizerFactories.put("nGram", new PreBuiltTokenizerFactoryFactory(PreBuiltTokenizers.NGRAM.getTokenizerFactory(Version.CURRENT)));
|
|
||||||
tokenizerFactories.put("edgeNGram", new PreBuiltTokenizerFactoryFactory(PreBuiltTokenizers.EDGE_NGRAM.getTokenizerFactory(Version.CURRENT)));
|
|
||||||
tokenizerFactories.put("PathHierarchy", new PreBuiltTokenizerFactoryFactory(PreBuiltTokenizers.PATH_HIERARCHY.getTokenizerFactory(Version.CURRENT)));
|
|
||||||
|
|
||||||
// Char Filters
|
// Char Filters
|
||||||
for (PreBuiltCharFilters preBuiltCharFilter : PreBuiltCharFilters.values()) {
|
for (PreBuiltCharFilters preBuiltCharFilter : PreBuiltCharFilters.values()) {
|
||||||
String name = preBuiltCharFilter.name().toLowerCase(Locale.ROOT);
|
String name = preBuiltCharFilter.name().toLowerCase(Locale.ROOT);
|
||||||
|
@ -429,8 +421,8 @@ public final class AnalysisRegistry implements Closeable {
|
||||||
|
|
||||||
this.analyzerProviderFactories = Collections.unmodifiableMap(analyzerProviderFactories);
|
this.analyzerProviderFactories = Collections.unmodifiableMap(analyzerProviderFactories);
|
||||||
this.charFilterFactories = Collections.unmodifiableMap(charFilterFactories);
|
this.charFilterFactories = Collections.unmodifiableMap(charFilterFactories);
|
||||||
this.tokenizerFactories = Collections.unmodifiableMap(tokenizerFactories);
|
this.preConfiguredTokenFilters = preConfiguredTokenFilters;
|
||||||
tokenFilterFactories = preConfiguredTokenFilters;
|
this.preConfiguredTokenizers = preConfiguredTokenizers;
|
||||||
}
|
}
|
||||||
|
|
||||||
public AnalysisModule.AnalysisProvider<CharFilterFactory> getCharFilterFactory(String name) {
|
public AnalysisModule.AnalysisProvider<CharFilterFactory> getCharFilterFactory(String name) {
|
||||||
|
@ -438,11 +430,11 @@ public final class AnalysisRegistry implements Closeable {
|
||||||
}
|
}
|
||||||
|
|
||||||
public AnalysisModule.AnalysisProvider<TokenFilterFactory> getTokenFilterFactory(String name) {
|
public AnalysisModule.AnalysisProvider<TokenFilterFactory> getTokenFilterFactory(String name) {
|
||||||
return tokenFilterFactories.get(name);
|
return preConfiguredTokenFilters.get(name);
|
||||||
}
|
}
|
||||||
|
|
||||||
public AnalysisModule.AnalysisProvider<TokenizerFactory> getTokenizerFactory(String name) {
|
public AnalysisModule.AnalysisProvider<TokenizerFactory> getTokenizerFactory(String name) {
|
||||||
return tokenizerFactories.get(name);
|
return preConfiguredTokenizers.get(name);
|
||||||
}
|
}
|
||||||
|
|
||||||
public AnalysisModule.AnalysisProvider<AnalyzerProvider<?>> getAnalyzerProvider(String name) {
|
public AnalysisModule.AnalysisProvider<AnalyzerProvider<?>> getAnalyzerProvider(String name) {
|
||||||
|
|
|
@ -1,50 +0,0 @@
|
||||||
/*
|
|
||||||
* Licensed to Elasticsearch under one or more contributor
|
|
||||||
* license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright
|
|
||||||
* ownership. Elasticsearch licenses this file to you under
|
|
||||||
* the Apache License, Version 2.0 (the "License"); you may
|
|
||||||
* not use this file except in compliance with the License.
|
|
||||||
* You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing,
|
|
||||||
* software distributed under the License is distributed on an
|
|
||||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
||||||
* KIND, either express or implied. See the License for the
|
|
||||||
* specific language governing permissions and limitations
|
|
||||||
* under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.elasticsearch.index.analysis;
|
|
||||||
|
|
||||||
import org.elasticsearch.Version;
|
|
||||||
import org.elasticsearch.common.settings.Settings;
|
|
||||||
import org.elasticsearch.env.Environment;
|
|
||||||
import org.elasticsearch.index.IndexSettings;
|
|
||||||
import org.elasticsearch.indices.analysis.AnalysisModule;
|
|
||||||
import org.elasticsearch.indices.analysis.PreBuiltTokenizers;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
public class PreBuiltTokenizerFactoryFactory implements AnalysisModule.AnalysisProvider<TokenizerFactory> {
|
|
||||||
|
|
||||||
private final TokenizerFactory tokenizerFactory;
|
|
||||||
|
|
||||||
public PreBuiltTokenizerFactoryFactory(TokenizerFactory tokenizerFactory) {
|
|
||||||
this.tokenizerFactory = tokenizerFactory;
|
|
||||||
}
|
|
||||||
|
|
||||||
public TokenizerFactory get(IndexSettings indexSettings, Environment environment, String name, Settings settings) throws IOException {
|
|
||||||
Version indexVersion = Version.indexCreated(settings);
|
|
||||||
if (!Version.CURRENT.equals(indexVersion)) {
|
|
||||||
PreBuiltTokenizers preBuiltTokenizers = PreBuiltTokenizers.getOrDefault(name, null);
|
|
||||||
if (preBuiltTokenizers != null) {
|
|
||||||
return preBuiltTokenizers.getTokenizerFactory(indexVersion);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return tokenizerFactory;
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -0,0 +1,64 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.elasticsearch.index.analysis;
|
||||||
|
|
||||||
|
import org.elasticsearch.Version;
|
||||||
|
import org.elasticsearch.common.settings.Settings;
|
||||||
|
import org.elasticsearch.env.Environment;
|
||||||
|
import org.elasticsearch.index.IndexSettings;
|
||||||
|
import org.elasticsearch.indices.analysis.AnalysisModule;
|
||||||
|
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Shared implementation for pre-configured analysis components.
|
||||||
|
*/
|
||||||
|
public abstract class PreConfiguredAnalysisComponent<T> implements AnalysisModule.AnalysisProvider<T> {
|
||||||
|
private final String name;
|
||||||
|
private final PreBuiltCacheFactory.PreBuiltCache<T> cache;
|
||||||
|
|
||||||
|
protected PreConfiguredAnalysisComponent(String name, PreBuiltCacheFactory.CachingStrategy cache) {
|
||||||
|
this.name = name;
|
||||||
|
this.cache = PreBuiltCacheFactory.getCache(cache);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public T get(IndexSettings indexSettings, Environment environment, String name, Settings settings) throws IOException {
|
||||||
|
Version versionCreated = Version.indexCreated(settings);
|
||||||
|
synchronized (this) {
|
||||||
|
T factory = cache.get(versionCreated);
|
||||||
|
if (factory == null) {
|
||||||
|
factory = create(versionCreated);
|
||||||
|
cache.put(versionCreated, factory);
|
||||||
|
}
|
||||||
|
return factory;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The name of the analysis component in the API.
|
||||||
|
*/
|
||||||
|
public String getName() {
|
||||||
|
return name;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected abstract T create(Version version);
|
||||||
|
}
|
|
@ -22,21 +22,16 @@ package org.elasticsearch.index.analysis;
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.elasticsearch.Version;
|
import org.elasticsearch.Version;
|
||||||
import org.elasticsearch.common.settings.Settings;
|
|
||||||
import org.elasticsearch.env.Environment;
|
|
||||||
import org.elasticsearch.index.IndexSettings;
|
|
||||||
import org.elasticsearch.indices.analysis.AnalysisModule;
|
|
||||||
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory;
|
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory;
|
||||||
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
|
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.function.BiFunction;
|
import java.util.function.BiFunction;
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Provides pre-configured, shared {@link TokenFilter}s.
|
* Provides pre-configured, shared {@link TokenFilter}s.
|
||||||
*/
|
*/
|
||||||
public final class PreConfiguredTokenFilter implements AnalysisModule.AnalysisProvider<TokenFilterFactory> {
|
public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisComponent<TokenFilterFactory> {
|
||||||
/**
|
/**
|
||||||
* Create a pre-configured token filter that may not vary at all.
|
* Create a pre-configured token filter that may not vary at all.
|
||||||
*/
|
*/
|
||||||
|
@ -60,35 +55,19 @@ public final class PreConfiguredTokenFilter implements AnalysisModule.AnalysisPr
|
||||||
*/
|
*/
|
||||||
public static PreConfiguredTokenFilter elasticsearchVersion(String name, boolean useFilterForMultitermQueries,
|
public static PreConfiguredTokenFilter elasticsearchVersion(String name, boolean useFilterForMultitermQueries,
|
||||||
BiFunction<TokenStream, org.elasticsearch.Version, TokenStream> create) {
|
BiFunction<TokenStream, org.elasticsearch.Version, TokenStream> create) {
|
||||||
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.ELASTICSEARCH,
|
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.ELASTICSEARCH, create);
|
||||||
(tokenStream, version) -> create.apply(tokenStream, version));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private final String name;
|
|
||||||
private final boolean useFilterForMultitermQueries;
|
private final boolean useFilterForMultitermQueries;
|
||||||
private final PreBuiltCacheFactory.PreBuiltCache<TokenFilterFactory> cache;
|
|
||||||
private final BiFunction<TokenStream, Version, TokenStream> create;
|
private final BiFunction<TokenStream, Version, TokenStream> create;
|
||||||
|
|
||||||
private PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries,
|
private PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries,
|
||||||
PreBuiltCacheFactory.CachingStrategy cache, BiFunction<TokenStream, Version, TokenStream> create) {
|
PreBuiltCacheFactory.CachingStrategy cache, BiFunction<TokenStream, Version, TokenStream> create) {
|
||||||
this.name = name;
|
super(name, cache);
|
||||||
this.useFilterForMultitermQueries = useFilterForMultitermQueries;
|
this.useFilterForMultitermQueries = useFilterForMultitermQueries;
|
||||||
this.cache = PreBuiltCacheFactory.getCache(cache);
|
|
||||||
this.create = create;
|
this.create = create;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public TokenFilterFactory get(IndexSettings indexSettings, Environment environment, String name, Settings settings) throws IOException {
|
|
||||||
return getTokenFilterFactory(Version.indexCreated(settings));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The name of the {@link TokenFilter} in the API.
|
|
||||||
*/
|
|
||||||
public String getName() {
|
|
||||||
return name;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Can this {@link TokenFilter} be used in multi-term queries?
|
* Can this {@link TokenFilter} be used in multi-term queries?
|
||||||
*/
|
*/
|
||||||
|
@ -98,14 +77,13 @@ public final class PreConfiguredTokenFilter implements AnalysisModule.AnalysisPr
|
||||||
|
|
||||||
private interface MultiTermAwareTokenFilterFactory extends TokenFilterFactory, MultiTermAwareComponent {}
|
private interface MultiTermAwareTokenFilterFactory extends TokenFilterFactory, MultiTermAwareComponent {}
|
||||||
|
|
||||||
private synchronized TokenFilterFactory getTokenFilterFactory(final Version version) {
|
@Override
|
||||||
TokenFilterFactory factory = cache.get(version);
|
protected TokenFilterFactory create(Version version) {
|
||||||
if (factory == null) {
|
|
||||||
if (useFilterForMultitermQueries) {
|
if (useFilterForMultitermQueries) {
|
||||||
factory = new MultiTermAwareTokenFilterFactory() {
|
return new MultiTermAwareTokenFilterFactory() {
|
||||||
@Override
|
@Override
|
||||||
public String name() {
|
public String name() {
|
||||||
return name;
|
return getName();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -118,11 +96,11 @@ public final class PreConfiguredTokenFilter implements AnalysisModule.AnalysisPr
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
} else {
|
}
|
||||||
factory = new TokenFilterFactory() {
|
return new TokenFilterFactory() {
|
||||||
@Override
|
@Override
|
||||||
public String name() {
|
public String name() {
|
||||||
return name;
|
return getName();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -131,9 +109,4 @@ public final class PreConfiguredTokenFilter implements AnalysisModule.AnalysisPr
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
cache.put(version, factory);
|
|
||||||
}
|
|
||||||
|
|
||||||
return factory;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,128 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.elasticsearch.index.analysis;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.elasticsearch.Version;
|
||||||
|
import org.elasticsearch.common.Nullable;
|
||||||
|
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory;
|
||||||
|
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
|
||||||
|
|
||||||
|
import java.util.function.Function;
|
||||||
|
import java.util.function.Supplier;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Provides pre-configured, shared {@link Tokenizer}s.
|
||||||
|
*/
|
||||||
|
public final class PreConfiguredTokenizer extends PreConfiguredAnalysisComponent<TokenizerFactory> {
|
||||||
|
/**
|
||||||
|
* Create a pre-configured tokenizer that may not vary at all.
|
||||||
|
*
|
||||||
|
* @param name the name of the tokenizer in the api
|
||||||
|
* @param create builds the tokenizer
|
||||||
|
* @param multiTermComponent null if this tokenizer shouldn't be used for multi-term queries, otherwise a supplier for the
|
||||||
|
* {@link TokenFilterFactory} that stands in for this tokenizer in multi-term queries.
|
||||||
|
*/
|
||||||
|
public static PreConfiguredTokenizer singleton(String name, Supplier<Tokenizer> create,
|
||||||
|
@Nullable Supplier<TokenFilterFactory> multiTermComponent) {
|
||||||
|
return new PreConfiguredTokenizer(name, CachingStrategy.ONE, version -> create.get(),
|
||||||
|
multiTermComponent == null ? null : version -> multiTermComponent.get());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a pre-configured tokenizer that may vary based on the Lucene version.
|
||||||
|
*
|
||||||
|
* @param name the name of the tokenizer in the api
|
||||||
|
* @param create builds the tokenizer
|
||||||
|
* @param multiTermComponent null if this tokenizer shouldn't be used for multi-term queries, otherwise a supplier for the
|
||||||
|
* {@link TokenFilterFactory} that stands in for this tokenizer in multi-term queries.
|
||||||
|
*/
|
||||||
|
public static PreConfiguredTokenizer luceneVersion(String name, Function<org.apache.lucene.util.Version, Tokenizer> create,
|
||||||
|
@Nullable Function<org.apache.lucene.util.Version, TokenFilterFactory> multiTermComponent) {
|
||||||
|
return new PreConfiguredTokenizer(name, CachingStrategy.LUCENE, version -> create.apply(version.luceneVersion),
|
||||||
|
multiTermComponent == null ? null : version -> multiTermComponent.apply(version.luceneVersion));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a pre-configured tokenizer that may vary based on the Elasticsearch version.
|
||||||
|
*
|
||||||
|
* @param name the name of the tokenizer in the api
|
||||||
|
* @param create builds the tokenizer
|
||||||
|
* @param multiTermComponent null if this tokenizer shouldn't be used for multi-term queries, otherwise a supplier for the
|
||||||
|
* {@link TokenFilterFactory} that stands in for this tokenizer in multi-term queries.
|
||||||
|
*/
|
||||||
|
public static PreConfiguredTokenizer elasticsearchVersion(String name, Function<org.elasticsearch.Version, Tokenizer> create,
|
||||||
|
@Nullable Function<Version, TokenFilterFactory> multiTermComponent) {
|
||||||
|
return new PreConfiguredTokenizer(name, CachingStrategy.ELASTICSEARCH, create, multiTermComponent);
|
||||||
|
}
|
||||||
|
|
||||||
|
private final Function<Version, Tokenizer> create;
|
||||||
|
private final Function<Version, TokenFilterFactory> multiTermComponent;
|
||||||
|
|
||||||
|
private PreConfiguredTokenizer(String name, PreBuiltCacheFactory.CachingStrategy cache, Function<Version, Tokenizer> create,
|
||||||
|
@Nullable Function<Version, TokenFilterFactory> multiTermComponent) {
|
||||||
|
super(name, cache);
|
||||||
|
this.create = create;
|
||||||
|
this.multiTermComponent = multiTermComponent;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Does this tokenizer have an equivalent component for analyzing multi-term queries?
|
||||||
|
*/
|
||||||
|
public boolean hasMultiTermComponent() {
|
||||||
|
return multiTermComponent != null;
|
||||||
|
}
|
||||||
|
|
||||||
|
private interface MultiTermAwareTokenizerFactory extends TokenizerFactory, MultiTermAwareComponent {}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected TokenizerFactory create(Version version) {
|
||||||
|
if (multiTermComponent != null) {
|
||||||
|
return new MultiTermAwareTokenizerFactory() {
|
||||||
|
@Override
|
||||||
|
public String name() {
|
||||||
|
return getName();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Tokenizer create() {
|
||||||
|
return create.apply(version);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Object getMultiTermComponent() {
|
||||||
|
return multiTermComponent.apply(version);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
} else {
|
||||||
|
return new TokenizerFactory() {
|
||||||
|
@Override
|
||||||
|
public String name() {
|
||||||
|
return getName();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Tokenizer create() {
|
||||||
|
return create.apply(version);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -104,6 +104,7 @@ import org.elasticsearch.index.analysis.PersianNormalizationFilterFactory;
|
||||||
import org.elasticsearch.index.analysis.PorterStemTokenFilterFactory;
|
import org.elasticsearch.index.analysis.PorterStemTokenFilterFactory;
|
||||||
import org.elasticsearch.index.analysis.PortugueseAnalyzerProvider;
|
import org.elasticsearch.index.analysis.PortugueseAnalyzerProvider;
|
||||||
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
|
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
|
||||||
|
import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
|
||||||
import org.elasticsearch.index.analysis.ReverseTokenFilterFactory;
|
import org.elasticsearch.index.analysis.ReverseTokenFilterFactory;
|
||||||
import org.elasticsearch.index.analysis.RomanianAnalyzerProvider;
|
import org.elasticsearch.index.analysis.RomanianAnalyzerProvider;
|
||||||
import org.elasticsearch.index.analysis.RussianAnalyzerProvider;
|
import org.elasticsearch.index.analysis.RussianAnalyzerProvider;
|
||||||
|
@ -141,7 +142,6 @@ import org.elasticsearch.index.analysis.WhitespaceAnalyzerProvider;
|
||||||
import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
|
import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
|
||||||
import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
|
import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
|
||||||
import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
|
import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
|
||||||
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
|
|
||||||
import org.elasticsearch.plugins.AnalysisPlugin;
|
import org.elasticsearch.plugins.AnalysisPlugin;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
@ -178,9 +178,10 @@ public final class AnalysisModule {
|
||||||
NamedRegistry<AnalysisProvider<AnalyzerProvider<?>>> normalizers = setupNormalizers(plugins);
|
NamedRegistry<AnalysisProvider<AnalyzerProvider<?>>> normalizers = setupNormalizers(plugins);
|
||||||
|
|
||||||
Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters = setupPreConfiguredTokenFilters(plugins);
|
Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters = setupPreConfiguredTokenFilters(plugins);
|
||||||
|
Map<String, PreConfiguredTokenizer> preConfiguredTokenizers = setupPreConfiguredTokenizers(plugins);
|
||||||
|
|
||||||
analysisRegistry = new AnalysisRegistry(environment, charFilters.getRegistry(), tokenFilters.getRegistry(), tokenizers
|
analysisRegistry = new AnalysisRegistry(environment, charFilters.getRegistry(), tokenFilters.getRegistry(), tokenizers
|
||||||
.getRegistry(), analyzers.getRegistry(), normalizers.getRegistry(), preConfiguredTokenFilters);
|
.getRegistry(), analyzers.getRegistry(), normalizers.getRegistry(), preConfiguredTokenFilters, preConfiguredTokenizers);
|
||||||
}
|
}
|
||||||
|
|
||||||
HunspellService getHunspellService() {
|
HunspellService getHunspellService() {
|
||||||
|
@ -287,6 +288,37 @@ public final class AnalysisModule {
|
||||||
return unmodifiableMap(preConfiguredTokenFilters.getRegistry());
|
return unmodifiableMap(preConfiguredTokenFilters.getRegistry());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static Map<String, PreConfiguredTokenizer> setupPreConfiguredTokenizers(List<AnalysisPlugin> plugins) {
|
||||||
|
NamedRegistry<PreConfiguredTokenizer> preConfiguredTokenizers = new NamedRegistry<>("pre-configured tokenizer");
|
||||||
|
|
||||||
|
// Temporary shim to register old style pre-configured tokenizers
|
||||||
|
for (PreBuiltTokenizers tokenizer : PreBuiltTokenizers.values()) {
|
||||||
|
String name = tokenizer.name().toLowerCase(Locale.ROOT);
|
||||||
|
PreConfiguredTokenizer preConfigured;
|
||||||
|
switch (tokenizer.getCachingStrategy()) {
|
||||||
|
case ONE:
|
||||||
|
preConfigured = PreConfiguredTokenizer.singleton(name,
|
||||||
|
() -> tokenizer.create(Version.CURRENT), null);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
throw new UnsupportedOperationException(
|
||||||
|
"Caching strategy unsupported by temporary shim [" + tokenizer + "]");
|
||||||
|
}
|
||||||
|
preConfiguredTokenizers.register(name, preConfigured);
|
||||||
|
}
|
||||||
|
// Temporary shim for aliases. TODO deprecate after they are moved
|
||||||
|
preConfiguredTokenizers.register("nGram", preConfiguredTokenizers.getRegistry().get("ngram"));
|
||||||
|
preConfiguredTokenizers.register("edgeNGram", preConfiguredTokenizers.getRegistry().get("edge_ngram"));
|
||||||
|
preConfiguredTokenizers.register("PathHierarchy", preConfiguredTokenizers.getRegistry().get("path_hierarchy"));
|
||||||
|
|
||||||
|
for (AnalysisPlugin plugin: plugins) {
|
||||||
|
for (PreConfiguredTokenizer tokenizer : plugin.getPreConfiguredTokenizers()) {
|
||||||
|
preConfiguredTokenizers.register(tokenizer.getName(), tokenizer);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return unmodifiableMap(preConfiguredTokenizers.getRegistry());
|
||||||
|
}
|
||||||
|
|
||||||
private NamedRegistry<AnalysisProvider<TokenizerFactory>> setupTokenizers(List<AnalysisPlugin> plugins) {
|
private NamedRegistry<AnalysisProvider<TokenizerFactory>> setupTokenizers(List<AnalysisPlugin> plugins) {
|
||||||
NamedRegistry<AnalysisProvider<TokenizerFactory>> tokenizers = new NamedRegistry<>("tokenizer");
|
NamedRegistry<AnalysisProvider<TokenizerFactory>> tokenizers = new NamedRegistry<>("tokenizer");
|
||||||
tokenizers.register("standard", StandardTokenizerFactory::new);
|
tokenizers.register("standard", StandardTokenizerFactory::new);
|
||||||
|
|
|
@ -21,7 +21,6 @@ package org.elasticsearch.indices.analysis;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||||
import org.apache.lucene.analysis.core.LetterTokenizer;
|
import org.apache.lucene.analysis.core.LetterTokenizer;
|
||||||
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
|
|
||||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
|
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
|
||||||
import org.apache.lucene.analysis.ngram.NGramTokenizer;
|
import org.apache.lucene.analysis.ngram.NGramTokenizer;
|
||||||
|
@ -33,6 +32,7 @@ import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
|
||||||
import org.apache.lucene.analysis.th.ThaiTokenizer;
|
import org.apache.lucene.analysis.th.ThaiTokenizer;
|
||||||
import org.elasticsearch.Version;
|
import org.elasticsearch.Version;
|
||||||
import org.elasticsearch.common.regex.Regex;
|
import org.elasticsearch.common.regex.Regex;
|
||||||
|
import org.elasticsearch.index.analysis.CustomNormalizerProvider;
|
||||||
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
|
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
|
||||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||||
import org.elasticsearch.index.analysis.TokenizerFactory;
|
import org.elasticsearch.index.analysis.TokenizerFactory;
|
||||||
|
@ -42,21 +42,21 @@ import java.util.Locale;
|
||||||
|
|
||||||
public enum PreBuiltTokenizers {
|
public enum PreBuiltTokenizers {
|
||||||
|
|
||||||
STANDARD(CachingStrategy.LUCENE) {
|
STANDARD(CachingStrategy.ONE) {
|
||||||
@Override
|
@Override
|
||||||
protected Tokenizer create(Version version) {
|
protected Tokenizer create(Version version) {
|
||||||
return new StandardTokenizer();
|
return new StandardTokenizer();
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
CLASSIC(CachingStrategy.LUCENE) {
|
CLASSIC(CachingStrategy.ONE) {
|
||||||
@Override
|
@Override
|
||||||
protected Tokenizer create(Version version) {
|
protected Tokenizer create(Version version) {
|
||||||
return new ClassicTokenizer();
|
return new ClassicTokenizer();
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
UAX_URL_EMAIL(CachingStrategy.LUCENE) {
|
UAX_URL_EMAIL(CachingStrategy.ONE) {
|
||||||
@Override
|
@Override
|
||||||
protected Tokenizer create(Version version) {
|
protected Tokenizer create(Version version) {
|
||||||
return new UAX29URLEmailTokenizer();
|
return new UAX29URLEmailTokenizer();
|
||||||
|
@ -77,39 +77,28 @@ public enum PreBuiltTokenizers {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
LETTER(CachingStrategy.LUCENE) {
|
LETTER(CachingStrategy.ONE) {
|
||||||
@Override
|
@Override
|
||||||
protected Tokenizer create(Version version) {
|
protected Tokenizer create(Version version) {
|
||||||
return new LetterTokenizer();
|
return new LetterTokenizer();
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
LOWERCASE(CachingStrategy.LUCENE) {
|
WHITESPACE(CachingStrategy.ONE) {
|
||||||
@Override
|
|
||||||
protected Tokenizer create(Version version) {
|
|
||||||
return new LowerCaseTokenizer();
|
|
||||||
}
|
|
||||||
@Override
|
|
||||||
protected TokenFilterFactory getMultiTermComponent(Version version) {
|
|
||||||
return PreBuiltTokenFilters.LOWERCASE.getTokenFilterFactory(version);
|
|
||||||
}
|
|
||||||
},
|
|
||||||
|
|
||||||
WHITESPACE(CachingStrategy.LUCENE) {
|
|
||||||
@Override
|
@Override
|
||||||
protected Tokenizer create(Version version) {
|
protected Tokenizer create(Version version) {
|
||||||
return new WhitespaceTokenizer();
|
return new WhitespaceTokenizer();
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
NGRAM(CachingStrategy.LUCENE) {
|
NGRAM(CachingStrategy.ONE) {
|
||||||
@Override
|
@Override
|
||||||
protected Tokenizer create(Version version) {
|
protected Tokenizer create(Version version) {
|
||||||
return new NGramTokenizer();
|
return new NGramTokenizer();
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
EDGE_NGRAM(CachingStrategy.LUCENE) {
|
EDGE_NGRAM(CachingStrategy.ONE) {
|
||||||
@Override
|
@Override
|
||||||
protected Tokenizer create(Version version) {
|
protected Tokenizer create(Version version) {
|
||||||
return new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
|
return new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
|
||||||
|
@ -139,14 +128,23 @@ public enum PreBuiltTokenizers {
|
||||||
}
|
}
|
||||||
|
|
||||||
protected final PreBuiltCacheFactory.PreBuiltCache<TokenizerFactory> cache;
|
protected final PreBuiltCacheFactory.PreBuiltCache<TokenizerFactory> cache;
|
||||||
|
private final CachingStrategy cachingStrategy;
|
||||||
|
|
||||||
PreBuiltTokenizers(CachingStrategy cachingStrategy) {
|
PreBuiltTokenizers(CachingStrategy cachingStrategy) {
|
||||||
|
this.cachingStrategy = cachingStrategy;
|
||||||
cache = PreBuiltCacheFactory.getCache(cachingStrategy);
|
cache = PreBuiltCacheFactory.getCache(cachingStrategy);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public CachingStrategy getCachingStrategy() {
|
||||||
|
return cachingStrategy;
|
||||||
|
}
|
||||||
|
|
||||||
private interface MultiTermAwareTokenizerFactory extends TokenizerFactory, MultiTermAwareComponent {}
|
private interface MultiTermAwareTokenizerFactory extends TokenizerFactory, MultiTermAwareComponent {}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Old style resolution for {@link TokenizerFactory}. Exists entirely to keep
|
||||||
|
* {@link CustomNormalizerProvider#build(java.util.Map, java.util.Map)} working during the migration.
|
||||||
|
*/
|
||||||
public synchronized TokenizerFactory getTokenizerFactory(final Version version) {
|
public synchronized TokenizerFactory getTokenizerFactory(final Version version) {
|
||||||
TokenizerFactory tokenizerFactory = cache.get(version);
|
TokenizerFactory tokenizerFactory = cache.get(version);
|
||||||
if (tokenizerFactory == null) {
|
if (tokenizerFactory == null) {
|
||||||
|
@ -186,17 +184,4 @@ public enum PreBuiltTokenizers {
|
||||||
|
|
||||||
return tokenizerFactory;
|
return tokenizerFactory;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Get a pre built Tokenizer by its name or fallback to the default one
|
|
||||||
* @param name Tokenizer name
|
|
||||||
* @param defaultTokenizer default Tokenizer if name not found
|
|
||||||
*/
|
|
||||||
public static PreBuiltTokenizers getOrDefault(String name, PreBuiltTokenizers defaultTokenizer) {
|
|
||||||
try {
|
|
||||||
return valueOf(name.toUpperCase(Locale.ROOT));
|
|
||||||
} catch (IllegalArgumentException e) {
|
|
||||||
return defaultTokenizer;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,24 +22,21 @@ package org.elasticsearch.plugins;
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.CharFilter;
|
import org.apache.lucene.analysis.CharFilter;
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.elasticsearch.Version;
|
|
||||||
import org.elasticsearch.common.settings.Settings;
|
import org.elasticsearch.common.settings.Settings;
|
||||||
import org.elasticsearch.env.Environment;
|
import org.elasticsearch.env.Environment;
|
||||||
import org.elasticsearch.index.IndexSettings;
|
import org.elasticsearch.index.IndexSettings;
|
||||||
import org.elasticsearch.index.analysis.AnalyzerProvider;
|
import org.elasticsearch.index.analysis.AnalyzerProvider;
|
||||||
import org.elasticsearch.index.analysis.CharFilterFactory;
|
import org.elasticsearch.index.analysis.CharFilterFactory;
|
||||||
|
import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
|
||||||
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
|
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
|
||||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||||
import org.elasticsearch.index.analysis.TokenizerFactory;
|
import org.elasticsearch.index.analysis.TokenizerFactory;
|
||||||
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
|
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
|
||||||
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.function.BiFunction;
|
|
||||||
|
|
||||||
import static java.util.Collections.emptyList;
|
import static java.util.Collections.emptyList;
|
||||||
import static java.util.Collections.emptyMap;
|
import static java.util.Collections.emptyMap;
|
||||||
|
@ -95,12 +92,19 @@ public interface AnalysisPlugin {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Override to add additional pre-configured token filters.
|
* Override to add additional pre-configured {@link TokenFilter}s.
|
||||||
*/
|
*/
|
||||||
default List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
|
default List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
|
||||||
return emptyList();
|
return emptyList();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Override to add additional pre-configured {@link Tokenizer}s.
|
||||||
|
*/
|
||||||
|
default List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
|
||||||
|
return emptyList();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Override to add additional hunspell {@link org.apache.lucene.analysis.hunspell.Dictionary}s.
|
* Override to add additional hunspell {@link org.apache.lucene.analysis.hunspell.Dictionary}s.
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -123,7 +123,8 @@ public class IndexModuleTests extends ESTestCase {
|
||||||
indexSettings = IndexSettingsModule.newIndexSettings("foo", settings);
|
indexSettings = IndexSettingsModule.newIndexSettings("foo", settings);
|
||||||
index = indexSettings.getIndex();
|
index = indexSettings.getIndex();
|
||||||
environment = new Environment(settings);
|
environment = new Environment(settings);
|
||||||
emptyAnalysisRegistry = new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap());
|
emptyAnalysisRegistry = new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(),
|
||||||
|
emptyMap(), emptyMap());
|
||||||
threadPool = new TestThreadPool("test");
|
threadPool = new TestThreadPool("test");
|
||||||
circuitBreakerService = new NoneCircuitBreakerService();
|
circuitBreakerService = new NoneCircuitBreakerService();
|
||||||
bigArrays = new BigArrays(settings, circuitBreakerService);
|
bigArrays = new BigArrays(settings, circuitBreakerService);
|
||||||
|
|
|
@ -41,7 +41,6 @@ import org.elasticsearch.test.VersionUtils;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.concurrent.atomic.AtomicBoolean;
|
|
||||||
|
|
||||||
import static java.util.Collections.emptyMap;
|
import static java.util.Collections.emptyMap;
|
||||||
import static java.util.Collections.singletonList;
|
import static java.util.Collections.singletonList;
|
||||||
|
@ -50,25 +49,29 @@ import static org.hamcrest.Matchers.equalTo;
|
||||||
import static org.hamcrest.Matchers.instanceOf;
|
import static org.hamcrest.Matchers.instanceOf;
|
||||||
|
|
||||||
public class AnalysisRegistryTests extends ESTestCase {
|
public class AnalysisRegistryTests extends ESTestCase {
|
||||||
|
|
||||||
private Environment emptyEnvironment;
|
|
||||||
private AnalysisRegistry emptyRegistry;
|
private AnalysisRegistry emptyRegistry;
|
||||||
private IndexSettings emptyIndexSettingsOfCurrentVersion;
|
|
||||||
|
|
||||||
private static AnalyzerProvider<?> analyzerProvider(final String name) {
|
private static AnalyzerProvider<?> analyzerProvider(final String name) {
|
||||||
return new PreBuiltAnalyzerProvider(name, AnalyzerScope.INDEX, new EnglishAnalyzer());
|
return new PreBuiltAnalyzerProvider(name, AnalyzerScope.INDEX, new EnglishAnalyzer());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static AnalysisRegistry emptyAnalysisRegistry(Settings settings) {
|
||||||
|
return new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(),
|
||||||
|
emptyMap());
|
||||||
|
}
|
||||||
|
|
||||||
|
private static IndexSettings indexSettingsOfCurrentVersion(Settings.Builder settings) {
|
||||||
|
return IndexSettingsModule.newIndexSettings("index", settings
|
||||||
|
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
|
||||||
|
.build());
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void setUp() throws Exception {
|
public void setUp() throws Exception {
|
||||||
super.setUp();
|
super.setUp();
|
||||||
emptyEnvironment = new Environment(Settings.builder()
|
emptyRegistry = emptyAnalysisRegistry(Settings.builder()
|
||||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||||
.build());
|
.build());
|
||||||
emptyRegistry = new AnalysisRegistry(emptyEnvironment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap());
|
|
||||||
emptyIndexSettingsOfCurrentVersion = IndexSettingsModule.newIndexSettings("index", Settings.builder()
|
|
||||||
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
|
|
||||||
.build());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testDefaultAnalyzers() throws IOException {
|
public void testDefaultAnalyzers() throws IOException {
|
||||||
|
@ -191,12 +194,8 @@ public class AnalysisRegistryTests extends ESTestCase {
|
||||||
Settings indexSettings = Settings.builder()
|
Settings indexSettings = Settings.builder()
|
||||||
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build();
|
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build();
|
||||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
|
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
|
||||||
IndexAnalyzers indexAnalyzers =
|
IndexAnalyzers indexAnalyzers = emptyAnalysisRegistry(settings).build(idxSettings);
|
||||||
new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())
|
IndexAnalyzers otherIndexAnalyzers = emptyAnalysisRegistry(settings).build(idxSettings);
|
||||||
.build(idxSettings);
|
|
||||||
IndexAnalyzers otherIndexAnalyzers =
|
|
||||||
new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())
|
|
||||||
.build(idxSettings);
|
|
||||||
final int numIters = randomIntBetween(5, 20);
|
final int numIters = randomIntBetween(5, 20);
|
||||||
for (int i = 0; i < numIters; i++) {
|
for (int i = 0; i < numIters; i++) {
|
||||||
PreBuiltAnalyzers preBuiltAnalyzers = RandomPicks.randomFrom(random(), PreBuiltAnalyzers.values());
|
PreBuiltAnalyzers preBuiltAnalyzers = RandomPicks.randomFrom(random(), PreBuiltAnalyzers.values());
|
||||||
|
@ -204,22 +203,6 @@ public class AnalysisRegistryTests extends ESTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testPreConfiguredTokenFiltersAreCached() throws IOException {
|
|
||||||
AtomicBoolean built = new AtomicBoolean(false);
|
|
||||||
PreConfiguredTokenFilter assertsBuiltOnce = PreConfiguredTokenFilter.singleton("asserts_built_once", false, tokenStream -> {
|
|
||||||
if (false == built.compareAndSet(false, true)) {
|
|
||||||
fail("Attempted to build the token filter twice when it should have been cached");
|
|
||||||
}
|
|
||||||
return new MockTokenFilter(tokenStream, MockTokenFilter.EMPTY_STOPSET);
|
|
||||||
});
|
|
||||||
try (AnalysisRegistry registryWithPreBuiltTokenFilter = new AnalysisRegistry(emptyEnvironment, emptyMap(), emptyMap(), emptyMap(),
|
|
||||||
emptyMap(), emptyMap(), singletonMap("asserts_built_once", assertsBuiltOnce))) {
|
|
||||||
IndexAnalyzers indexAnalyzers = registryWithPreBuiltTokenFilter.build(emptyIndexSettingsOfCurrentVersion);
|
|
||||||
IndexAnalyzers otherIndexAnalyzers = registryWithPreBuiltTokenFilter.build(emptyIndexSettingsOfCurrentVersion);
|
|
||||||
assertSame(indexAnalyzers.get("asserts_built_once"), otherIndexAnalyzers.get("asserts_built_once"));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testNoTypeOrTokenizerErrorMessage() throws IOException {
|
public void testNoTypeOrTokenizerErrorMessage() throws IOException {
|
||||||
Version version = VersionUtils.randomVersion(random());
|
Version version = VersionUtils.randomVersion(random());
|
||||||
Settings settings = Settings
|
Settings settings = Settings
|
||||||
|
@ -231,14 +214,12 @@ public class AnalysisRegistryTests extends ESTestCase {
|
||||||
.build();
|
.build();
|
||||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
|
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
|
||||||
|
|
||||||
IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () ->
|
IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> emptyAnalysisRegistry(settings).build(idxSettings));
|
||||||
new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())
|
|
||||||
.build(idxSettings));
|
|
||||||
assertThat(e.getMessage(), equalTo("analyzer [test_analyzer] must specify either an analyzer type, or a tokenizer"));
|
assertThat(e.getMessage(), equalTo("analyzer [test_analyzer] must specify either an analyzer type, or a tokenizer"));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testCloseIndexAnalyzersMultipleTimes() throws IOException {
|
public void testCloseIndexAnalyzersMultipleTimes() throws IOException {
|
||||||
IndexAnalyzers indexAnalyzers = emptyRegistry.build(emptyIndexSettingsOfCurrentVersion);
|
IndexAnalyzers indexAnalyzers = emptyRegistry.build(indexSettingsOfCurrentVersion(Settings.builder()));
|
||||||
indexAnalyzers.close();
|
indexAnalyzers.close();
|
||||||
indexAnalyzers.close();
|
indexAnalyzers.close();
|
||||||
}
|
}
|
||||||
|
|
|
@ -23,7 +23,6 @@ import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
|
||||||
import org.apache.lucene.analysis.hunspell.Dictionary;
|
import org.apache.lucene.analysis.hunspell.Dictionary;
|
||||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
@ -37,12 +36,12 @@ import org.elasticsearch.env.Environment;
|
||||||
import org.elasticsearch.index.IndexSettings;
|
import org.elasticsearch.index.IndexSettings;
|
||||||
import org.elasticsearch.index.analysis.Analysis;
|
import org.elasticsearch.index.analysis.Analysis;
|
||||||
import org.elasticsearch.index.analysis.AnalysisRegistry;
|
import org.elasticsearch.index.analysis.AnalysisRegistry;
|
||||||
import org.elasticsearch.index.analysis.AnalysisTestsHelper;
|
|
||||||
import org.elasticsearch.index.analysis.CharFilterFactory;
|
import org.elasticsearch.index.analysis.CharFilterFactory;
|
||||||
import org.elasticsearch.index.analysis.CustomAnalyzer;
|
import org.elasticsearch.index.analysis.CustomAnalyzer;
|
||||||
import org.elasticsearch.index.analysis.IndexAnalyzers;
|
import org.elasticsearch.index.analysis.IndexAnalyzers;
|
||||||
import org.elasticsearch.index.analysis.NamedAnalyzer;
|
import org.elasticsearch.index.analysis.NamedAnalyzer;
|
||||||
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
|
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
|
||||||
|
import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
|
||||||
import org.elasticsearch.index.analysis.StandardTokenizerFactory;
|
import org.elasticsearch.index.analysis.StandardTokenizerFactory;
|
||||||
import org.elasticsearch.index.analysis.StopTokenFilterFactory;
|
import org.elasticsearch.index.analysis.StopTokenFilterFactory;
|
||||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||||
|
@ -57,7 +56,6 @@ import org.hamcrest.MatcherAssert;
|
||||||
import java.io.BufferedWriter;
|
import java.io.BufferedWriter;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.StringReader;
|
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
@ -164,18 +162,6 @@ public class AnalysisModuleTests extends ESTestCase {
|
||||||
assertEquals(org.apache.lucene.util.Version.fromBits(3,6,0), indexAnalyzers.get("custom7").analyzer().getVersion());
|
assertEquals(org.apache.lucene.util.Version.fromBits(3,6,0), indexAnalyzers.get("custom7").analyzer().getVersion());
|
||||||
}
|
}
|
||||||
|
|
||||||
private void assertTokenFilter(String name, Class<?> clazz) throws IOException {
|
|
||||||
Settings settings = Settings.builder()
|
|
||||||
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
|
|
||||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
|
|
||||||
TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
|
|
||||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get(name);
|
|
||||||
Tokenizer tokenizer = new WhitespaceTokenizer();
|
|
||||||
tokenizer.setReader(new StringReader("foo bar"));
|
|
||||||
TokenStream stream = tokenFilter.create(tokenizer);
|
|
||||||
assertThat(stream, instanceOf(clazz));
|
|
||||||
}
|
|
||||||
|
|
||||||
private void testSimpleConfiguration(Settings settings) throws IOException {
|
private void testSimpleConfiguration(Settings settings) throws IOException {
|
||||||
IndexAnalyzers indexAnalyzers = getIndexAnalyzers(settings);
|
IndexAnalyzers indexAnalyzers = getIndexAnalyzers(settings);
|
||||||
Analyzer analyzer = indexAnalyzers.get("custom1").analyzer();
|
Analyzer analyzer = indexAnalyzers.get("custom1").analyzer();
|
||||||
|
@ -269,27 +255,6 @@ public class AnalysisModuleTests extends ESTestCase {
|
||||||
* and that do not vary based on version at all.
|
* and that do not vary based on version at all.
|
||||||
*/
|
*/
|
||||||
public void testPluginPreConfiguredTokenFilters() throws IOException {
|
public void testPluginPreConfiguredTokenFilters() throws IOException {
|
||||||
// Simple token filter that appends text to the term
|
|
||||||
final class AppendTokenFilter extends TokenFilter {
|
|
||||||
private final CharTermAttribute term = addAttribute(CharTermAttribute.class);
|
|
||||||
private final char[] appendMe;
|
|
||||||
|
|
||||||
protected AppendTokenFilter(TokenStream input, String appendMe) {
|
|
||||||
super(input);
|
|
||||||
this.appendMe = appendMe.toCharArray();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean incrementToken() throws IOException {
|
|
||||||
if (false == input.incrementToken()) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
term.resizeBuffer(term.length() + appendMe.length);
|
|
||||||
System.arraycopy(appendMe, 0, term.buffer(), term.length(), appendMe.length);
|
|
||||||
term.setLength(term.length() + appendMe.length);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
boolean noVersionSupportsMultiTerm = randomBoolean();
|
boolean noVersionSupportsMultiTerm = randomBoolean();
|
||||||
boolean luceneVersionSupportsMultiTerm = randomBoolean();
|
boolean luceneVersionSupportsMultiTerm = randomBoolean();
|
||||||
boolean elasticsearchVersionSupportsMultiTerm = randomBoolean();
|
boolean elasticsearchVersionSupportsMultiTerm = randomBoolean();
|
||||||
|
@ -329,6 +294,82 @@ public class AnalysisModuleTests extends ESTestCase {
|
||||||
analyzers.get("elasticsearch_version").normalize("", "test").utf8ToString());
|
analyzers.get("elasticsearch_version").normalize("", "test").utf8ToString());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests that plugins can register pre-configured tokenizers that vary in behavior based on Elasticsearch version, Lucene version,
|
||||||
|
* and that do not vary based on version at all.
|
||||||
|
*/
|
||||||
|
public void testPluginPreConfiguredTokenizers() throws IOException {
|
||||||
|
boolean noVersionSupportsMultiTerm = randomBoolean();
|
||||||
|
boolean luceneVersionSupportsMultiTerm = randomBoolean();
|
||||||
|
boolean elasticsearchVersionSupportsMultiTerm = randomBoolean();
|
||||||
|
|
||||||
|
// Simple tokenizer that always spits out a single token with some preconfigured characters
|
||||||
|
final class FixedTokenizer extends Tokenizer {
|
||||||
|
private final CharTermAttribute term = addAttribute(CharTermAttribute.class);
|
||||||
|
private final char[] chars;
|
||||||
|
private boolean read = false;
|
||||||
|
|
||||||
|
protected FixedTokenizer(String chars) {
|
||||||
|
this.chars = chars.toCharArray();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
if (read) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
clearAttributes();
|
||||||
|
read = true;
|
||||||
|
term.resizeBuffer(chars.length);
|
||||||
|
System.arraycopy(chars, 0, term.buffer(), 0, chars.length);
|
||||||
|
term.setLength(chars.length);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void reset() throws IOException {
|
||||||
|
super.reset();
|
||||||
|
read = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
AnalysisRegistry registry = new AnalysisModule(new Environment(emptyNodeSettings), singletonList(new AnalysisPlugin() {
|
||||||
|
@Override
|
||||||
|
public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
|
||||||
|
return Arrays.asList(
|
||||||
|
PreConfiguredTokenizer.singleton("no_version", () -> new FixedTokenizer("no_version"),
|
||||||
|
noVersionSupportsMultiTerm ? () -> AppendTokenFilter.factoryForSuffix("no_version") : null),
|
||||||
|
PreConfiguredTokenizer.luceneVersion("lucene_version",
|
||||||
|
luceneVersion -> new FixedTokenizer(luceneVersion.toString()),
|
||||||
|
luceneVersionSupportsMultiTerm ?
|
||||||
|
luceneVersion -> AppendTokenFilter.factoryForSuffix(luceneVersion.toString()) : null),
|
||||||
|
PreConfiguredTokenizer.elasticsearchVersion("elasticsearch_version",
|
||||||
|
esVersion -> new FixedTokenizer(esVersion.toString()),
|
||||||
|
elasticsearchVersionSupportsMultiTerm ?
|
||||||
|
esVersion -> AppendTokenFilter.factoryForSuffix(esVersion.toString()) : null)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
})).getAnalysisRegistry();
|
||||||
|
|
||||||
|
Version version = VersionUtils.randomVersion(random());
|
||||||
|
IndexAnalyzers analyzers = getIndexAnalyzers(registry, Settings.builder()
|
||||||
|
.put("index.analysis.analyzer.no_version.tokenizer", "no_version")
|
||||||
|
.put("index.analysis.analyzer.lucene_version.tokenizer", "lucene_version")
|
||||||
|
.put("index.analysis.analyzer.elasticsearch_version.tokenizer", "elasticsearch_version")
|
||||||
|
.put(IndexMetaData.SETTING_VERSION_CREATED, version)
|
||||||
|
.build());
|
||||||
|
assertTokenStreamContents(analyzers.get("no_version").tokenStream("", "test"), new String[] {"no_version"});
|
||||||
|
assertTokenStreamContents(analyzers.get("lucene_version").tokenStream("", "test"), new String[] {version.luceneVersion.toString()});
|
||||||
|
assertTokenStreamContents(analyzers.get("elasticsearch_version").tokenStream("", "test"), new String[] {version.toString()});
|
||||||
|
|
||||||
|
// These are currently broken by https://github.com/elastic/elasticsearch/issues/24752
|
||||||
|
// assertEquals("test" + (noVersionSupportsMultiTerm ? "no_version" : ""),
|
||||||
|
// analyzers.get("no_version").normalize("", "test").utf8ToString());
|
||||||
|
// assertEquals("test" + (luceneVersionSupportsMultiTerm ? version.luceneVersion.toString() : ""),
|
||||||
|
// analyzers.get("lucene_version").normalize("", "test").utf8ToString());
|
||||||
|
// assertEquals("test" + (elasticsearchVersionSupportsMultiTerm ? version.toString() : ""),
|
||||||
|
// analyzers.get("elasticsearch_version").normalize("", "test").utf8ToString());
|
||||||
|
}
|
||||||
|
|
||||||
public void testRegisterHunspellDictionary() throws Exception {
|
public void testRegisterHunspellDictionary() throws Exception {
|
||||||
Settings settings = Settings.builder()
|
Settings settings = Settings.builder()
|
||||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||||
|
@ -349,4 +390,41 @@ public class AnalysisModuleTests extends ESTestCase {
|
||||||
}));
|
}));
|
||||||
assertSame(dictionary, module.getHunspellService().getDictionary("foo"));
|
assertSame(dictionary, module.getHunspellService().getDictionary("foo"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Simple token filter that appends text to the term
|
||||||
|
/**
 * Token filter that appends a fixed suffix to every term produced by the
 * wrapped stream.
 */
private static class AppendTokenFilter extends TokenFilter {
|
||||||
|
/**
 * Builds a factory whose {@code name()} is the suffix itself and whose
 * {@code create} wraps a stream in an {@link AppendTokenFilter} for that suffix.
 */
public static TokenFilterFactory factoryForSuffix(String suffix) {
|
||||||
|
return new TokenFilterFactory() {
|
||||||
|
@Override
|
||||||
|
public String name() {
|
||||||
|
return suffix;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TokenStream create(TokenStream tokenStream) {
|
||||||
|
return new AppendTokenFilter(tokenStream, suffix);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Term attribute of this stream; edited in place by incrementToken().
private final CharTermAttribute term = addAttribute(CharTermAttribute.class);
|
||||||
|
// Characters appended to each term.
private final char[] appendMe;
|
||||||
|
|
||||||
|
/**
 * @param input the stream whose terms get the suffix
 * @param appendMe the suffix, stored as a char array
 */
protected AppendTokenFilter(TokenStream input, String appendMe) {
|
||||||
|
super(input);
|
||||||
|
this.appendMe = appendMe.toCharArray();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Advances the wrapped stream and appends the suffix to the current term.
 *
 * @return {@code false} once the wrapped stream has no more tokens, {@code true} otherwise
 */
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
if (false == input.incrementToken()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
// Grow the buffer first so the suffix fits after the current term.
term.resizeBuffer(term.length() + appendMe.length);
|
||||||
|
// Copy the suffix in right behind the existing characters.
System.arraycopy(appendMe, 0, term.buffer(), term.length(), appendMe.length);
|
||||||
|
term.setLength(term.length() + appendMe.length);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,7 +20,9 @@
|
||||||
package org.elasticsearch.analysis.common;
|
package org.elasticsearch.analysis.common;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.CharArraySet;
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
|
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||||
import org.apache.lucene.analysis.StopFilter;
|
import org.apache.lucene.analysis.StopFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
|
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
|
||||||
import org.apache.lucene.analysis.ar.ArabicStemFilter;
|
import org.apache.lucene.analysis.ar.ArabicStemFilter;
|
||||||
import org.apache.lucene.analysis.br.BrazilianStemFilter;
|
import org.apache.lucene.analysis.br.BrazilianStemFilter;
|
||||||
|
@ -29,6 +31,7 @@ import org.apache.lucene.analysis.cjk.CJKWidthFilter;
|
||||||
import org.apache.lucene.analysis.ckb.SoraniNormalizationFilter;
|
import org.apache.lucene.analysis.ckb.SoraniNormalizationFilter;
|
||||||
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
|
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
|
||||||
import org.apache.lucene.analysis.core.DecimalDigitFilter;
|
import org.apache.lucene.analysis.core.DecimalDigitFilter;
|
||||||
|
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
|
||||||
import org.apache.lucene.analysis.core.StopAnalyzer;
|
import org.apache.lucene.analysis.core.StopAnalyzer;
|
||||||
import org.apache.lucene.analysis.core.UpperCaseFilter;
|
import org.apache.lucene.analysis.core.UpperCaseFilter;
|
||||||
import org.apache.lucene.analysis.cz.CzechStemFilter;
|
import org.apache.lucene.analysis.cz.CzechStemFilter;
|
||||||
|
@ -66,6 +69,7 @@ import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory;
|
||||||
import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory;
|
import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory;
|
||||||
import org.elasticsearch.index.analysis.LimitTokenCountFilterFactory;
|
import org.elasticsearch.index.analysis.LimitTokenCountFilterFactory;
|
||||||
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
|
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
|
||||||
|
import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
|
||||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||||
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
|
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
|
||||||
import org.elasticsearch.plugins.AnalysisPlugin;
|
import org.elasticsearch.plugins.AnalysisPlugin;
|
||||||
|
@ -174,4 +178,21 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
|
||||||
| WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null)));
|
| WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null)));
|
||||||
return filters;
|
return filters;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Returns the pre-configured tokenizers contributed by this plugin. Only
 * "lowercase" is registered here, built from {@link LowerCaseTokenizer}.
 */
@Override
|
||||||
|
public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
|
||||||
|
List<PreConfiguredTokenizer> tokenizers = new ArrayList<>();
|
||||||
|
// NOTE(review): the third argument supplies a TokenFilterFactory that lowercases
// via LowerCaseFilter; presumably it is the tokenizer's multi-term component.
// Confirm against PreConfiguredTokenizer.singleton.
tokenizers.add(PreConfiguredTokenizer.singleton("lowercase", LowerCaseTokenizer::new, () -> new TokenFilterFactory() {
|
||||||
|
@Override
|
||||||
|
public String name() {
|
||||||
|
return "lowercase";
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TokenStream create(TokenStream tokenStream) {
|
||||||
|
return new LowerCaseFilter(tokenStream);
|
||||||
|
}
|
||||||
|
}));
|
||||||
|
return tokenizers;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -117,6 +117,13 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
|
||||||
return filters;
|
return filters;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Pre-configured tokenizers that should be available with this plugin
 * installed, seeded from the superclass's tokenizer expectations and
 * returned in sorted key order.
 */
@Override
|
||||||
|
protected Map<String, Class<?>> getPreConfiguredTokenizers() {
|
||||||
|
// Seed from the parent's tokenizer map, not its token filter map.
Map<String, Class<?>> tokenizers = new TreeMap<>(super.getPreConfiguredTokenizers());
|
||||||
|
|
||||||
|
return tokenizers;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Fails if a tokenizer is marked in the superclass with {@link MovedToAnalysisCommon} but
|
* Fails if a tokenizer is marked in the superclass with {@link MovedToAnalysisCommon} but
|
||||||
* hasn't been marked in this class with its proper factory.
|
* hasn't been marked in this class with its proper factory.
|
||||||
|
|
|
@ -66,6 +66,7 @@ import org.elasticsearch.index.analysis.PatternTokenizerFactory;
|
||||||
import org.elasticsearch.index.analysis.PersianNormalizationFilterFactory;
|
import org.elasticsearch.index.analysis.PersianNormalizationFilterFactory;
|
||||||
import org.elasticsearch.index.analysis.PorterStemTokenFilterFactory;
|
import org.elasticsearch.index.analysis.PorterStemTokenFilterFactory;
|
||||||
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
|
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
|
||||||
|
import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
|
||||||
import org.elasticsearch.index.analysis.ReverseTokenFilterFactory;
|
import org.elasticsearch.index.analysis.ReverseTokenFilterFactory;
|
||||||
import org.elasticsearch.index.analysis.ScandinavianFoldingFilterFactory;
|
import org.elasticsearch.index.analysis.ScandinavianFoldingFilterFactory;
|
||||||
import org.elasticsearch.index.analysis.ScandinavianNormalizationFilterFactory;
|
import org.elasticsearch.index.analysis.ScandinavianNormalizationFilterFactory;
|
||||||
|
@ -95,6 +96,7 @@ import java.util.Collection;
|
||||||
import java.util.EnumMap;
|
import java.util.EnumMap;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
@ -103,6 +105,7 @@ import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import static java.util.Collections.singletonList;
|
import static java.util.Collections.singletonList;
|
||||||
|
import static org.hamcrest.Matchers.typeCompatibleWith;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Alerts us if new analysis components are added to Lucene, so we don't miss them.
|
* Alerts us if new analysis components are added to Lucene, so we don't miss them.
|
||||||
|
@ -148,26 +151,6 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
|
||||||
.put("simplepatternsplit", Void.class)
|
.put("simplepatternsplit", Void.class)
|
||||||
.immutableMap();
|
.immutableMap();
|
||||||
|
|
||||||
static final Map<PreBuiltTokenizers, Class<?>> PREBUILT_TOKENIZERS;
|
|
||||||
static {
|
|
||||||
PREBUILT_TOKENIZERS = new EnumMap<>(PreBuiltTokenizers.class);
|
|
||||||
for (PreBuiltTokenizers tokenizer : PreBuiltTokenizers.values()) {
|
|
||||||
Class<?> luceneFactoryClazz;
|
|
||||||
switch (tokenizer) {
|
|
||||||
case UAX_URL_EMAIL:
|
|
||||||
luceneFactoryClazz = org.apache.lucene.analysis.standard.UAX29URLEmailTokenizerFactory.class;
|
|
||||||
break;
|
|
||||||
case PATH_HIERARCHY:
|
|
||||||
luceneFactoryClazz = Void.class;
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
luceneFactoryClazz = org.apache.lucene.analysis.util.TokenizerFactory.lookupClass(
|
|
||||||
toCamelCase(tokenizer.getTokenizerFactory(Version.CURRENT).name()));
|
|
||||||
}
|
|
||||||
PREBUILT_TOKENIZERS.put(tokenizer, luceneFactoryClazz);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static final Map<String,Class<?>> KNOWN_TOKENFILTERS = new MapBuilder<String,Class<?>>()
|
static final Map<String,Class<?>> KNOWN_TOKENFILTERS = new MapBuilder<String,Class<?>>()
|
||||||
// exposed in ES
|
// exposed in ES
|
||||||
.put("apostrophe", ApostropheFilterFactory.class)
|
.put("apostrophe", ApostropheFilterFactory.class)
|
||||||
|
@ -319,22 +302,26 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
|
||||||
this.plugin = Objects.requireNonNull(plugin, "plugin is required. use an empty plugin for core");
|
this.plugin = Objects.requireNonNull(plugin, "plugin is required. use an empty plugin for core");
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Map<String, Class<?>> getTokenizers() {
|
protected Map<String, Class<?>> getCharFilters() {
|
||||||
return KNOWN_TOKENIZERS;
|
return KNOWN_CHARFILTERS;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Map<String, Class<?>> getTokenFilters() {
|
protected Map<String, Class<?>> getTokenFilters() {
|
||||||
return KNOWN_TOKENFILTERS;
|
return KNOWN_TOKENFILTERS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected Map<String, Class<?>> getTokenizers() {
|
||||||
|
return KNOWN_TOKENIZERS;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Map containing pre-configured token filters that should be available
|
* Map containing pre-configured token filters that should be available
|
||||||
* after installing this plugin. The map is from the name of the token
|
* after installing this plugin. The map is from the name of the token
|
||||||
* filter to the class of the Lucene {@link TokenFilterFactory} that it
|
* filter to the class of the Lucene {@link TokenFilterFactory} that it
|
||||||
* is emulating. If the Lucene filter factory is {@code null} then the
|
* is emulating. If the Lucene {@linkplain TokenFilterFactory} is
|
||||||
* test will look it up for you from the name. If there is no Lucene
|
* {@code null} then the test will look it up for you from the name. If
|
||||||
* {@linkplain TokenFilterFactory} then the right hand side should
|
* there is no Lucene {@linkplain TokenFilterFactory} then the right
|
||||||
* be {@link Void}.
|
* hand side should be {@link Void}.
|
||||||
*/
|
*/
|
||||||
protected Map<String, Class<?>> getPreConfiguredTokenFilters() {
|
protected Map<String, Class<?>> getPreConfiguredTokenFilters() {
|
||||||
Map<String, Class<?>> filters = new HashMap<>();
|
Map<String, Class<?>> filters = new HashMap<>();
|
||||||
|
@ -343,8 +330,33 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
|
||||||
return filters;
|
return filters;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Map<String, Class<?>> getCharFilters() {
|
/**
|
||||||
return KNOWN_CHARFILTERS;
|
* Map containing pre-configured tokenizers that should be available
|
||||||
|
* after installing this plugin. The map is from the name of the
|
||||||
|
* tokenizer to the class of the Lucene {@link TokenizerFactory} that it
|
||||||
|
* is emulating. If the Lucene {@linkplain TokenizerFactory} is
|
||||||
|
* {@code null} then the test will look it up for you from the name.
|
||||||
|
* If there is no Lucene {@linkplain TokenizerFactory} then the right
|
||||||
|
* hand side should be {@link Void}.
|
||||||
|
*/
|
||||||
|
protected Map<String, Class<?>> getPreConfiguredTokenizers() {
|
||||||
|
Map<String, Class<?>> tokenizers = new HashMap<>();
|
||||||
|
// TODO drop this temporary shim when all the old style tokenizers have been migrated to new style
|
||||||
|
// One entry per PreBuiltTokenizers constant, keyed by its lowercased enum name.
for (PreBuiltTokenizers tokenizer : PreBuiltTokenizers.values()) {
|
||||||
|
final Class<?> luceneFactoryClazz;
|
||||||
|
switch (tokenizer) {
|
||||||
|
// Explicit mapping; presumably the name-based lookup would not find this factory (confirm).
case UAX_URL_EMAIL:
|
||||||
|
luceneFactoryClazz = org.apache.lucene.analysis.standard.UAX29URLEmailTokenizerFactory.class;
|
||||||
|
break;
|
||||||
|
case PATH_HIERARCHY:
|
||||||
|
// Void marks "no Lucene factory"; the multi-term check skips these entries.
luceneFactoryClazz = Void.class;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
// null asks the test to look the Lucene factory up from the name.
luceneFactoryClazz = null;
|
||||||
|
}
|
||||||
|
tokenizers.put(tokenizer.name().toLowerCase(Locale.ROOT), luceneFactoryClazz);
|
||||||
|
}
|
||||||
|
return tokenizers;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testTokenizers() {
|
public void testTokenizers() {
|
||||||
|
@ -421,21 +433,8 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
|
||||||
Collection<Object> expected = new HashSet<>();
|
Collection<Object> expected = new HashSet<>();
|
||||||
Collection<Object> actual = new HashSet<>();
|
Collection<Object> actual = new HashSet<>();
|
||||||
|
|
||||||
for (Map.Entry<PreBuiltTokenizers, Class<?>> entry : PREBUILT_TOKENIZERS.entrySet()) {
|
Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters =
|
||||||
PreBuiltTokenizers tokenizer = entry.getKey();
|
AnalysisModule.setupPreConfiguredTokenFilters(singletonList(plugin));
|
||||||
Class<?> luceneFactory = entry.getValue();
|
|
||||||
if (luceneFactory == Void.class) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
assertTrue(TokenizerFactory.class.isAssignableFrom(luceneFactory));
|
|
||||||
if (tokenizer.getTokenizerFactory(Version.CURRENT) instanceof MultiTermAwareComponent) {
|
|
||||||
actual.add(tokenizer);
|
|
||||||
}
|
|
||||||
if (org.apache.lucene.analysis.util.MultiTermAwareComponent.class.isAssignableFrom(luceneFactory)) {
|
|
||||||
expected.add(tokenizer);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Map<String, PreConfiguredTokenFilter> preBuiltTokenFilters = AnalysisModule.setupPreConfiguredTokenFilters(singletonList(plugin));
|
|
||||||
for (Map.Entry<String, Class<?>> entry : getPreConfiguredTokenFilters().entrySet()) {
|
for (Map.Entry<String, Class<?>> entry : getPreConfiguredTokenFilters().entrySet()) {
|
||||||
String name = entry.getKey();
|
String name = entry.getKey();
|
||||||
Class<?> luceneFactory = entry.getValue();
|
Class<?> luceneFactory = entry.getValue();
|
||||||
|
@ -445,8 +444,8 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
|
||||||
if (luceneFactory == null) {
|
if (luceneFactory == null) {
|
||||||
luceneFactory = TokenFilterFactory.lookupClass(toCamelCase(name));
|
luceneFactory = TokenFilterFactory.lookupClass(toCamelCase(name));
|
||||||
}
|
}
|
||||||
assertTrue(TokenFilterFactory.class.isAssignableFrom(luceneFactory));
|
assertThat(luceneFactory, typeCompatibleWith(TokenFilterFactory.class));
|
||||||
PreConfiguredTokenFilter filter = preBuiltTokenFilters.get(name);
|
PreConfiguredTokenFilter filter = preConfiguredTokenFilters.get(name);
|
||||||
assertNotNull("test claims pre built token filter [" + name + "] should be available but it wasn't", filter);
|
assertNotNull("test claims pre built token filter [" + name + "] should be available but it wasn't", filter);
|
||||||
if (filter.shouldUseFilterForMultitermQueries()) {
|
if (filter.shouldUseFilterForMultitermQueries()) {
|
||||||
actual.add("token filter [" + name + "]");
|
actual.add("token filter [" + name + "]");
|
||||||
|
@ -455,6 +454,25 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
|
||||||
expected.add("token filter [" + name + "]");
|
expected.add("token filter [" + name + "]");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Map<String, PreConfiguredTokenizer> preConfiguredTokenizers = AnalysisModule.setupPreConfiguredTokenizers(singletonList(plugin));
|
||||||
|
for (Map.Entry<String, Class<?>> entry : getPreConfiguredTokenizers().entrySet()) {
|
||||||
|
String name = entry.getKey();
|
||||||
|
Class<?> luceneFactory = entry.getValue();
|
||||||
|
if (luceneFactory == Void.class) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (luceneFactory == null) {
|
||||||
|
luceneFactory = TokenizerFactory.lookupClass(toCamelCase(name));
|
||||||
|
}
|
||||||
|
assertThat(luceneFactory, typeCompatibleWith(TokenizerFactory.class));
|
||||||
|
PreConfiguredTokenizer tokenizer = preConfiguredTokenizers.get(name);
|
||||||
|
if (tokenizer.hasMultiTermComponent()) {
|
||||||
|
actual.add(tokenizer);
|
||||||
|
}
|
||||||
|
if (org.apache.lucene.analysis.util.MultiTermAwareComponent.class.isAssignableFrom(luceneFactory)) {
|
||||||
|
expected.add(tokenizer);
|
||||||
|
}
|
||||||
|
}
|
||||||
for (Map.Entry<PreBuiltCharFilters, Class<?>> entry : PREBUILT_CHARFILTERS.entrySet()) {
|
for (Map.Entry<PreBuiltCharFilters, Class<?>> entry : PREBUILT_CHARFILTERS.entrySet()) {
|
||||||
PreBuiltCharFilters charFilter = entry.getKey();
|
PreBuiltCharFilters charFilter = entry.getKey();
|
||||||
Class<?> luceneFactory = entry.getValue();
|
Class<?> luceneFactory = entry.getValue();
|
||||||
|
|
Loading…
Reference in New Issue