Move pre-configured "keyword" tokenizer to the analysis-common module (#24863)
Moves the keyword tokenizer to the analysis-common module. The keyword tokenizer is special because it is used by CustomNormalizerProvider, so I pulled it out into its own PR. To make the move work, I reworked the lookup from a static one to one that goes through the AnalysisRegistry. This seems safe enough. Part of #23658.
parent 7b358190d6
commit ecc87f613f
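For background on why this tokenizer is the special case: a normalizer is an analyzer that must keep the entire input as a single token, so every custom normalizer is built on the keyword tokenizer plus character-level filters. Below is a minimal standalone sketch of that idea in plain Lucene, not Elasticsearch code; the class name is illustrative, and LowerCaseFilter's analyzers-common package is an assumption (it moved between Lucene releases).

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Sketch: a "normalizer" keeps the whole input as one token (KeywordTokenizer)
// and runs only character-level filters over it. That single-token guarantee
// is why CustomNormalizerProvider always needs the keyword tokenizer, and why
// this commit threads its factory through explicitly.
public class NormalizerSketch {
    public static void main(String[] args) throws Exception {
        Analyzer normalizer = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer source = new KeywordTokenizer();
                return new TokenStreamComponents(source, new LowerCaseFilter(source));
            }
        };
        try (TokenStream ts = normalizer.tokenStream("field", "Hello World")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(term); // prints "hello world" -- one token, lowercased
            }
            ts.end();
        }
        normalizer.close();
    }
}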
AnalysisRegistry.java

@@ -466,7 +466,7 @@ public final class AnalysisRegistry implements Closeable {
         }
         for (Map.Entry<String, AnalyzerProvider<?>> entry : normalizerProviders.entrySet()) {
             processNormalizerFactory(deprecationLogger, indexSettings, entry.getKey(), entry.getValue(), normalizers,
-                    tokenFilterFactoryFactories, charFilterFactoryFactories);
+                    tokenizerFactoryFactories.get("keyword"), tokenFilterFactoryFactories, charFilterFactoryFactories);
         }
         for (Map.Entry<String, NamedAnalyzer> entry : analyzerAliases.entrySet()) {
             String key = entry.getKey();

@@ -585,10 +585,11 @@ public final class AnalysisRegistry implements Closeable {
                                           String name,
                                           AnalyzerProvider<?> normalizerFactory,
                                           Map<String, NamedAnalyzer> normalizers,
+                                          TokenizerFactory keywordTokenizerFactory,
                                           Map<String, TokenFilterFactory> tokenFilters,
                                           Map<String, CharFilterFactory> charFilters) {
         if (normalizerFactory instanceof CustomNormalizerProvider) {
-            ((CustomNormalizerProvider) normalizerFactory).build(charFilters, tokenFilters);
+            ((CustomNormalizerProvider) normalizerFactory).build(keywordTokenizerFactory, charFilters, tokenFilters);
         }
         Analyzer normalizerF = normalizerFactory.get();
         if (normalizerF == null) {
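The rework the commit message mentions is visible in the hunks above: instead of a static PreBuiltTokenizers.KEYWORD lookup, the registry resolves "keyword" from the pre-configured tokenizer factories it already built and threads the factory down to the normalizer code. A self-contained stand-in sketch of that shape, using placeholder types rather than real Elasticsearch classes (only the "keyword" key and the build(...) threading come from the diff):

import java.util.Map;

// Placeholder for org.elasticsearch.index.analysis.TokenizerFactory.
interface TokenizerFactory {}

public class RegistryLookupSketch {
    // New style: the registry resolves the factory from its own map once
    // and passes it down explicitly, rather than each consumer doing a
    // static, version-keyed enum lookup.
    static void processNormalizers(Map<String, TokenizerFactory> tokenizerFactories) {
        TokenizerFactory keyword = tokenizerFactories.get("keyword");
        buildNormalizer(keyword);
    }

    // Corresponds to CustomNormalizerProvider.build(keywordTokenizerFactory, ...).
    static void buildNormalizer(TokenizerFactory keywordTokenizerFactory) {
        // build the normalizer around the single keyword tokenizer instance
    }
}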
CustomNormalizerProvider.java

@@ -21,7 +21,6 @@ package org.elasticsearch.index.analysis;

 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.IndexSettings;
-import org.elasticsearch.indices.analysis.PreBuiltTokenizers;

 import java.util.ArrayList;
 import java.util.List;

@@ -44,7 +43,8 @@ public final class CustomNormalizerProvider extends AbstractIndexAnalyzerProvider
         this.analyzerSettings = settings;
     }

-    public void build(final Map<String, CharFilterFactory> charFilters, final Map<String, TokenFilterFactory> tokenFilters) {
+    public void build(final TokenizerFactory keywordTokenizerFactory, final Map<String, CharFilterFactory> charFilters,
+            final Map<String, TokenFilterFactory> tokenFilters) {
         String tokenizerName = analyzerSettings.get("tokenizer");
         if (tokenizerName != null) {
             throw new IllegalArgumentException("Custom normalizer [" + name() + "] cannot configure a tokenizer");

@@ -83,7 +83,7 @@ public final class CustomNormalizerProvider extends AbstractIndexAnalyzerProvider

         this.customAnalyzer = new CustomAnalyzer(
             "keyword",
-            PreBuiltTokenizers.KEYWORD.getTokenizerFactory(indexSettings.getIndexVersionCreated()),
+            keywordTokenizerFactory,
             charFiltersList.toArray(new CharFilterFactory[charFiltersList.size()]),
             tokenFilterList.toArray(new TokenFilterFactory[tokenFilterList.size()])
         );
PreBuiltTokenizers.java

@@ -19,7 +19,6 @@
 package org.elasticsearch.indices.analysis;

 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.LetterTokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;

@@ -32,10 +31,7 @@ import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
 import org.apache.lucene.analysis.th.ThaiTokenizer;
 import org.elasticsearch.Version;
 import org.elasticsearch.common.regex.Regex;
-import org.elasticsearch.index.analysis.CustomNormalizerProvider;
-import org.elasticsearch.index.analysis.MultiTermAwareComponent;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
-import org.elasticsearch.index.analysis.TokenizerFactory;
 import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;

 public enum PreBuiltTokenizers {

@@ -68,13 +64,6 @@ public enum PreBuiltTokenizers {
         }
     },

-    KEYWORD(CachingStrategy.ONE) {
-        @Override
-        protected Tokenizer create(Version version) {
-            return new KeywordTokenizer();
-        }
-    },
-
     LETTER(CachingStrategy.ONE) {
         @Override
         protected Tokenizer create(Version version) {

@@ -125,50 +114,13 @@ public enum PreBuiltTokenizers {
         return null;
     }

-    protected final PreBuiltCacheFactory.PreBuiltCache<TokenizerFactory> cache;
     private final CachingStrategy cachingStrategy;

     PreBuiltTokenizers(CachingStrategy cachingStrategy) {
         this.cachingStrategy = cachingStrategy;
-        cache = PreBuiltCacheFactory.getCache(cachingStrategy);
     }

     public CachingStrategy getCachingStrategy() {
         return cachingStrategy;
     }
-
-    private interface MultiTermAwareTokenizerFactory extends TokenizerFactory, MultiTermAwareComponent {}
-
-    /**
-     * Old style resolution for {@link TokenizerFactory}. Exists entirely to keep
-     * {@link CustomNormalizerProvider#build(java.util.Map, java.util.Map)} working during the migration.
-     */
-    public synchronized TokenizerFactory getTokenizerFactory(final Version version) {
-        TokenizerFactory tokenizerFactory = cache.get(version);
-        if (tokenizerFactory == null) {
-            if (getMultiTermComponent(version) != null) {
-                tokenizerFactory = new MultiTermAwareTokenizerFactory() {
-                    @Override
-                    public Tokenizer create() {
-                        return PreBuiltTokenizers.this.create(version);
-                    }
-
-                    @Override
-                    public Object getMultiTermComponent() {
-                        return PreBuiltTokenizers.this.getMultiTermComponent(version);
-                    }
-                };
-            } else {
-                tokenizerFactory = new TokenizerFactory() {
-                    @Override
-                    public Tokenizer create() {
-                        return PreBuiltTokenizers.this.create(version);
-                    }
-                };
-            }
-            cache.put(version, tokenizerFactory);
-        }
-
-        return tokenizerFactory;
-    }
 }
CommonAnalysisPlugin.java

@@ -32,6 +32,7 @@ import org.apache.lucene.analysis.cjk.CJKWidthFilter;
 import org.apache.lucene.analysis.ckb.SoraniNormalizationFilter;
 import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
 import org.apache.lucene.analysis.core.DecimalDigitFilter;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.LowerCaseTokenizer;
 import org.apache.lucene.analysis.core.StopAnalyzer;
 import org.apache.lucene.analysis.core.UpperCaseFilter;

@@ -215,6 +216,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
     @Override
     public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
         List<PreConfiguredTokenizer> tokenizers = new ArrayList<>();
+        tokenizers.add(PreConfiguredTokenizer.singleton("keyword", KeywordTokenizer::new, null));
        tokenizers.add(PreConfiguredTokenizer.singleton("lowercase", LowerCaseTokenizer::new, () -> new TokenFilterFactory() {
             @Override
             public String name() {
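Taken together, the two hunks above are the whole registration. Sketched as a complete class below so the extension point is visible; the plugin class name is illustrative, the exact import packages are assumptions, and the singleton(...) line is the one this commit adds.

import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

// Minimal sketch of the extension point the hunks above use: an analysis
// plugin contributes pre-configured tokenizers by name.
public class KeywordTokenizerPlugin extends Plugin implements AnalysisPlugin {
    @Override
    public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
        List<PreConfiguredTokenizer> tokenizers = new ArrayList<>();
        // singleton(...) registers one shared instance; the third argument is
        // an optional multi-term-aware TokenFilterFactory supplier, null here
        // as in the hunk above.
        tokenizers.add(PreConfiguredTokenizer.singleton("keyword", KeywordTokenizer::new, null));
        return tokenizers;
    }
}

Registering it as a singleton matches the CachingStrategy.ONE constant removed from PreBuiltTokenizers: the keyword tokenizer never varies with the index's created version, so one shared instance suffices, whereas the neighboring "lowercase" entry also supplies a multi-term-aware component.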
CommonAnalysisFactoryTests.java

@@ -145,6 +145,7 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
     @Override
     protected Map<String, Class<?>> getPreConfiguredTokenizers() {
         Map<String, Class<?>> filters = new TreeMap<>(super.getPreConfiguredTokenizers());
+        filters.put("keyword", null);
         filters.put("lowercase", null);
         return filters;
     }