Add name() method to TokenizerFactory (#43909)
This brings TokenizerFactory into line with CharFilterFactory and TokenFilterFactory, and removes the need to pass around tokenizer names when building custom analyzers. As this means that TokenizerFactory is no longer a functional interface, the commit also adds a factory method to TokenizerFactory to make construction simpler.
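For orientation before reading the diff, the interface ends up with the following shape; this is taken directly from the TokenizerFactory hunk further down:

public interface TokenizerFactory {

    String name();

    Tokenizer create();

    static TokenizerFactory newFactory(String name, Supplier<Tokenizer> supplier) {
        return new TokenizerFactory() {
            @Override
            public String name() {
                return name;
            }

            @Override
            public Tokenizer create() {
                return supplier.get();
            }
        };
    }
}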
parent 1b6109517a
commit 4b99255fed
@@ -0,0 +1,30 @@
+[[breaking-changes-7.4]]
+== Breaking changes in 7.4
+++++
+<titleabbrev>7.4</titleabbrev>
+++++
+
+This section discusses the changes that you need to be aware of when migrating
+your application to Elasticsearch 7.4.
+
+See also <<release-highlights>> and <<es-release-notes>>.
+
+coming[7.4.0]
+
+//NOTE: The notable-breaking-changes tagged regions are re-used in the
+//Installation and Upgrade Guide
+
+//tag::notable-breaking-changes[]
+
+// end::notable-breaking-changes[]
+
+[[breaking_74_plugin_changes]]
+=== Plugins changes
+
+[float]
+==== TokenizerFactory changes
+
+TokenizerFactory now has a `name()` method that must be implemented. Most
+plugin-provided TokenizerFactory implementations will extend `AbstractTokenizerFactory`,
+which now takes a `name` parameter in its constructor.
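To make that migration concrete: a plugin factory that extends AbstractTokenizerFactory only needs to forward the name it already receives from the AnalysisProvider to the new super constructor. A minimal sketch (MyPluginTokenizerFactory is a made-up name; the pattern mirrors the constructor changes in the hunks below):

public class MyPluginTokenizerFactory extends AbstractTokenizerFactory {

    public MyPluginTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        // was: super(indexSettings, settings);
        // the name is now stored by AbstractTokenizerFactory and returned from name()
        super(indexSettings, settings, name);
    }

    @Override
    public Tokenizer create() {
        return new WhitespaceTokenizer(); // whichever Lucene Tokenizer the plugin actually provides
    }
}

Factories that implement TokenizerFactory directly can instead return TokenizerFactory.newFactory(name, supplier), as the test plugins later in this diff do.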
@@ -39,7 +39,7 @@ public class CharGroupTokenizerFactory extends AbstractTokenizerFactory {
     private boolean tokenizeOnSymbol = false;
 
     public CharGroupTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, settings);
+        super(indexSettings, settings, name);
 
         for (final String c : settings.getAsList("tokenize_on_chars")) {
             if (c == null || c.length() == 0) {
@@ -35,7 +35,7 @@ public class ClassicTokenizerFactory extends AbstractTokenizerFactory {
     private final int maxTokenLength;
 
     ClassicTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, settings);
+        super(indexSettings, settings, name);
         maxTokenLength = settings.getAsInt("max_token_length", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
     }
 
@@ -36,7 +36,7 @@ public class EdgeNGramTokenizerFactory extends AbstractTokenizerFactory {
     private final CharMatcher matcher;
 
     EdgeNGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, settings);
+        super(indexSettings, settings, name);
         this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
         this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
         this.matcher = parseTokenChars(settings.getAsList("token_chars"));
@@ -31,7 +31,7 @@ public class KeywordTokenizerFactory extends AbstractTokenizerFactory {
     private final int bufferSize;
 
     KeywordTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, settings);
+        super(indexSettings, settings, name);
         bufferSize = settings.getAsInt("buffer_size", 256);
     }
 
@@ -29,7 +29,7 @@ import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
 public class LetterTokenizerFactory extends AbstractTokenizerFactory {
 
     LetterTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, settings);
+        super(indexSettings, settings, name);
     }
 
     @Override
@@ -85,7 +85,7 @@ public class NGramTokenizerFactory extends AbstractTokenizerFactory {
     }
 
     NGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, settings);
+        super(indexSettings, settings, name);
         int maxAllowedNgramDiff = indexSettings.getMaxNgramDiff();
         this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
         this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
@@ -37,7 +37,7 @@ public class PathHierarchyTokenizerFactory extends AbstractTokenizerFactory {
     private final boolean reverse;
 
     PathHierarchyTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, settings);
+        super(indexSettings, settings, name);
         bufferSize = settings.getAsInt("buffer_size", 1024);
         String delimiter = settings.get("delimiter");
         if (delimiter == null) {
@@ -35,7 +35,7 @@ public class PatternTokenizerFactory extends AbstractTokenizerFactory {
     private final int group;
 
     PatternTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, settings);
+        super(indexSettings, settings, name);
 
         String sPattern = settings.get("pattern", "\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/);
         if (sPattern == null) {
@@ -31,7 +31,7 @@ public class SimplePatternSplitTokenizerFactory extends AbstractTokenizerFactory
     private final String pattern;
 
     public SimplePatternSplitTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, settings);
+        super(indexSettings, settings, name);
 
         pattern = settings.get("pattern", "");
     }
@@ -31,7 +31,7 @@ public class SimplePatternTokenizerFactory extends AbstractTokenizerFactory {
     private final String pattern;
 
     public SimplePatternTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, settings);
+        super(indexSettings, settings, name);
 
         pattern = settings.get("pattern", "");
     }
@@ -116,7 +116,7 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
 
     Analyzer buildSynonymAnalyzer(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
                                   List<TokenFilterFactory> tokenFilters, Function<String, TokenFilterFactory> allFilters) {
-        return new CustomAnalyzer("synonyms", tokenizer, charFilters.toArray(new CharFilterFactory[0]),
+        return new CustomAnalyzer(tokenizer, charFilters.toArray(new CharFilterFactory[0]),
             tokenFilters.stream()
                 .map(TokenFilterFactory::getSynonymFilter)
                 .toArray(TokenFilterFactory[]::new));
@@ -32,7 +32,7 @@ import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
 public class ThaiTokenizerFactory extends AbstractTokenizerFactory {
 
     ThaiTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, settings);
+        super(indexSettings, settings, name);
     }
 
     @Override
@@ -32,7 +32,7 @@ public class UAX29URLEmailTokenizerFactory extends AbstractTokenizerFactory {
     private final int maxTokenLength;
 
     UAX29URLEmailTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, settings);
+        super(indexSettings, settings, name);
         maxTokenLength = settings.getAsInt("max_token_length", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
     }
 
@@ -34,7 +34,7 @@ public class WhitespaceTokenizerFactory extends AbstractTokenizerFactory {
     private Integer maxTokenLength;
 
     WhitespaceTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, settings);
+        super(indexSettings, settings, name);
         maxTokenLength = settings.getAsInt(MAX_TOKEN_LENGTH, StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
     }
 
@@ -30,7 +30,7 @@ import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
 public class XLowerCaseTokenizerFactory extends AbstractTokenizerFactory {
 
     public XLowerCaseTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, settings);
+        super(indexSettings, settings, name);
     }
 
     @Override
@@ -19,7 +19,7 @@
         tokenizer:
           type: keyword
   - length: { detail.tokenizer.tokens: 1 }
-  - match: { detail.tokenizer.name: _anonymous_tokenizer }
+  - match: { detail.tokenizer.name: __anonymous__keyword }
   - match: { detail.tokenizer.tokens.0.token: Foo Bar! }
 
 ---
@@ -48,7 +48,7 @@
           type: simple_pattern
           pattern: "[abcdef0123456789]{4}"
   - length: { detail.tokenizer.tokens: 2 }
-  - match: { detail.tokenizer.name: _anonymous_tokenizer }
+  - match: { detail.tokenizer.name: __anonymous__simple_pattern }
   - match: { detail.tokenizer.tokens.0.token: a6bf }
   - match: { detail.tokenizer.tokens.1.token: ff61 }
 
@@ -63,7 +63,7 @@
           type: simple_pattern_split
           pattern: ==
   - length: { detail.tokenizer.tokens: 2 }
-  - match: { detail.tokenizer.name: _anonymous_tokenizer }
+  - match: { detail.tokenizer.name: __anonymous__simple_pattern_split }
   - match: { detail.tokenizer.tokens.0.token: foo }
   - match: { detail.tokenizer.tokens.1.token: bar }
 
@@ -77,7 +77,7 @@
         tokenizer:
           type: thai
   - length: { detail.tokenizer.tokens: 2 }
-  - match: { detail.tokenizer.name: _anonymous_tokenizer }
+  - match: { detail.tokenizer.name: __anonymous__thai }
   - match: { detail.tokenizer.tokens.0.token: ภาษา }
   - match: { detail.tokenizer.tokens.1.token: ไทย }
 
@@ -104,7 +104,7 @@
           min_gram: 3
           max_gram: 3
   - length: { detail.tokenizer.tokens: 4 }
-  - match: { detail.tokenizer.name: _anonymous_tokenizer }
+  - match: { detail.tokenizer.name: __anonymous__ngram }
   - match: { detail.tokenizer.tokens.0.token: foo }
   - match: { detail.tokenizer.tokens.1.token: oob }
   - match: { detail.tokenizer.tokens.2.token: oba }
@@ -120,7 +120,7 @@
           min_gram: 3
           max_gram: 3
   - length: { detail.tokenizer.tokens: 4 }
-  - match: { detail.tokenizer.name: _anonymous_tokenizer }
+  - match: { detail.tokenizer.name: __anonymous__ngram }
   - match: { detail.tokenizer.tokens.0.token: foo }
   - match: { detail.tokenizer.tokens.1.token: oob }
   - match: { detail.tokenizer.tokens.2.token: oba }
@@ -166,7 +166,7 @@
           min_gram: 1
           max_gram: 3
   - length: { detail.tokenizer.tokens: 3 }
-  - match: { detail.tokenizer.name: _anonymous_tokenizer }
+  - match: { detail.tokenizer.name: __anonymous__edge_ngram }
   - match: { detail.tokenizer.tokens.0.token: f }
   - match: { detail.tokenizer.tokens.1.token: fo }
   - match: { detail.tokenizer.tokens.2.token: foo }
@@ -181,7 +181,7 @@
           min_gram: 1
           max_gram: 3
   - length: { detail.tokenizer.tokens: 3 }
-  - match: { detail.tokenizer.name: _anonymous_tokenizer }
+  - match: { detail.tokenizer.name: __anonymous__edge_ngram }
   - match: { detail.tokenizer.tokens.0.token: f }
   - match: { detail.tokenizer.tokens.1.token: fo }
   - match: { detail.tokenizer.tokens.2.token: foo }
@@ -218,7 +218,7 @@
         tokenizer:
           type: classic
   - length: { detail.tokenizer.tokens: 4 }
-  - match: { detail.tokenizer.name: _anonymous_tokenizer }
+  - match: { detail.tokenizer.name: __anonymous__classic }
   - match: { detail.tokenizer.tokens.0.token: Brown }
   - match: { detail.tokenizer.tokens.1.token: Foxes }
   - match: { detail.tokenizer.tokens.2.token: don't }
@@ -247,7 +247,7 @@
         tokenizer:
           type: letter
   - length: { detail.tokenizer.tokens: 5 }
-  - match: { detail.tokenizer.name: _anonymous_tokenizer }
+  - match: { detail.tokenizer.name: __anonymous__letter }
   - match: { detail.tokenizer.tokens.0.token: Brown }
   - match: { detail.tokenizer.tokens.1.token: Foxes }
   - match: { detail.tokenizer.tokens.2.token: don }
@@ -278,7 +278,7 @@
         tokenizer:
           type: lowercase
   - length: { detail.tokenizer.tokens: 5 }
-  - match: { detail.tokenizer.name: _anonymous_tokenizer }
+  - match: { detail.tokenizer.name: __anonymous__lowercase }
   - match: { detail.tokenizer.tokens.0.token: brown }
   - match: { detail.tokenizer.tokens.1.token: foxes }
   - match: { detail.tokenizer.tokens.2.token: don }
@@ -309,7 +309,7 @@
         tokenizer:
           type: path_hierarchy
   - length: { detail.tokenizer.tokens: 3 }
-  - match: { detail.tokenizer.name: _anonymous_tokenizer }
+  - match: { detail.tokenizer.name: __anonymous__path_hierarchy }
   - match: { detail.tokenizer.tokens.0.token: a }
   - match: { detail.tokenizer.tokens.1.token: a/b }
   - match: { detail.tokenizer.tokens.2.token: a/b/c }
@@ -322,7 +322,7 @@
         tokenizer:
           type: PathHierarchy
   - length: { detail.tokenizer.tokens: 3 }
-  - match: { detail.tokenizer.name: _anonymous_tokenizer }
+  - match: { detail.tokenizer.name: __anonymous__PathHierarchy }
   - match: { detail.tokenizer.tokens.0.token: a }
   - match: { detail.tokenizer.tokens.1.token: a/b }
   - match: { detail.tokenizer.tokens.2.token: a/b/c }
@@ -361,7 +361,7 @@
         tokenizer:
           type: pattern
   - length: { detail.tokenizer.tokens: 5 }
-  - match: { detail.tokenizer.name: _anonymous_tokenizer }
+  - match: { detail.tokenizer.name: __anonymous__pattern }
   - match: { detail.tokenizer.tokens.0.token: split }
   - match: { detail.tokenizer.tokens.1.token: by }
   - match: { detail.tokenizer.tokens.2.token: whitespace }
@@ -392,7 +392,7 @@
         tokenizer:
           type: uax_url_email
   - length: { detail.tokenizer.tokens: 4 }
-  - match: { detail.tokenizer.name: _anonymous_tokenizer }
+  - match: { detail.tokenizer.name: __anonymous__uax_url_email }
   - match: { detail.tokenizer.tokens.0.token: Email }
   - match: { detail.tokenizer.tokens.1.token: me }
   - match: { detail.tokenizer.tokens.2.token: at }
@@ -421,7 +421,7 @@
         tokenizer:
           type: whitespace
   - length: { detail.tokenizer.tokens: 3 }
-  - match: { detail.tokenizer.name: _anonymous_tokenizer }
+  - match: { detail.tokenizer.name: __anonymous__whitespace }
   - match: { detail.tokenizer.tokens.0.token: split }
   - match: { detail.tokenizer.tokens.1.token: by }
   - match: { detail.tokenizer.tokens.2.token: whitespace }
@@ -106,7 +106,7 @@
 
   - length: { detail.tokenizer.tokens: 1 }
   - length: { detail.tokenfilters.0.tokens: 1 }
-  - match: { detail.tokenizer.name: keyword_for_normalizer }
+  - match: { detail.tokenizer.name: keyword }
   - match: { detail.tokenizer.tokens.0.token: ABc }
   - match: { detail.tokenfilters.0.name: lowercase }
   - match: { detail.tokenfilters.0.tokens.0.token: abc }
@@ -47,7 +47,7 @@ public class IcuTokenizerFactory extends AbstractTokenizerFactory {
     private static final String RULE_FILES = "rule_files";
 
     public IcuTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, settings);
+        super(indexSettings, settings, name);
         config = getIcuConfig(environment, settings);
     }
 
@@ -45,7 +45,7 @@ public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
     private boolean discartPunctuation;
 
     public KuromojiTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
-        super(indexSettings, settings);
+        super(indexSettings, settings, name);
         mode = getMode(settings);
         userDictionary = getUserDictionary(env, settings);
         discartPunctuation = settings.getAsBoolean("discard_punctuation", true);
@@ -41,7 +41,7 @@ public class NoriTokenizerFactory extends AbstractTokenizerFactory {
     private final KoreanTokenizer.DecompoundMode decompoundMode;
 
     public NoriTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
-        super(indexSettings, settings);
+        super(indexSettings, settings, name);
         decompoundMode = getMode(settings);
         userDictionary = getUserDictionary(env, settings);
     }
@@ -28,7 +28,7 @@ import org.elasticsearch.index.IndexSettings;
 public class SmartChineseTokenizerTokenizerFactory extends AbstractTokenizerFactory {
 
     public SmartChineseTokenizerTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, settings);
+        super(indexSettings, settings, name);
     }
 
     @Override
@@ -278,7 +278,6 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeAc
         CharFilterFactory[] charFilterFactories = components.getCharFilters();
         TokenizerFactory tokenizerFactory = components.getTokenizerFactory();
         TokenFilterFactory[] tokenFilterFactories = components.getTokenFilters();
-        String tokenizerName = components.getTokenizerName();
 
         String[][] charFiltersTexts = new String[charFilterFactories != null ? charFilterFactories.length : 0][request.text().length];
         TokenListCreator[] tokenFiltersTokenListCreator = new TokenListCreator[tokenFilterFactories != null ?
@@ -338,7 +337,8 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeAc
                 }
             }
             detailResponse = new AnalyzeAction.DetailAnalyzeResponse(charFilteredLists,
-                new AnalyzeAction.AnalyzeTokenList(tokenizerName, tokenizerTokenListCreator.getArrayTokens()), tokenFilterLists);
+                new AnalyzeAction.AnalyzeTokenList(tokenizerFactory.name(), tokenizerTokenListCreator.getArrayTokens()),
+                tokenFilterLists);
         } else {
             String name;
             if (analyzer instanceof NamedAnalyzer) {
@@ -26,13 +26,20 @@ import org.elasticsearch.index.IndexSettings;
 
 public abstract class AbstractTokenizerFactory extends AbstractIndexComponent implements TokenizerFactory {
     protected final Version version;
+    private final String name;
 
-    public AbstractTokenizerFactory(IndexSettings indexSettings, Settings settings) {
+    public AbstractTokenizerFactory(IndexSettings indexSettings, Settings settings, String name) {
         super(indexSettings);
         this.version = Analysis.parseAnalysisVersion(this.indexSettings.getSettings(), settings, logger);
+        this.name = name;
     }
 
     public final Version version() {
         return version;
     }
+
+    @Override
+    public String name() {
+        return name;
+    }
 }
@@ -19,6 +19,7 @@
 package org.elasticsearch.index.analysis;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.Version;
@@ -251,11 +252,7 @@ public final class AnalysisRegistry implements Closeable {
             tokenFilterFactories.add(tff);
         }
 
-        String tokenizerName = tokenizer.name == null ? "_anonymous_tokenizer" : tokenizer.name;
-        if (normalizer) {
-            tokenizerName = "keyword_for_normalizer";
-        }
-        Analyzer analyzer = new CustomAnalyzer(tokenizerName, tokenizerFactory,
+        Analyzer analyzer = new CustomAnalyzer(tokenizerFactory,
             charFilterFactories.toArray(new CharFilterFactory[]{}),
             tokenFilterFactories.toArray(new TokenFilterFactory[]{}));
         return produceAnalyzer("__custom__", new AnalyzerProvider<Analyzer>() {
@@ -537,10 +534,12 @@ public final class AnalysisRegistry implements Closeable {
             });
         }
         for (Map.Entry<String, AnalyzerProvider<?>> entry : normalizerProviders.entrySet()) {
-            processNormalizerFactory(entry.getKey(), entry.getValue(), normalizers, "keyword",
-                tokenizerFactoryFactories.get("keyword"), tokenFilterFactoryFactories, charFilterFactoryFactories);
+            processNormalizerFactory(entry.getKey(), entry.getValue(), normalizers,
+                TokenizerFactory.newFactory("keyword", KeywordTokenizer::new),
+                tokenFilterFactoryFactories, charFilterFactoryFactories);
             processNormalizerFactory(entry.getKey(), entry.getValue(), whitespaceNormalizers,
-                "whitespace", () -> new WhitespaceTokenizer(), tokenFilterFactoryFactories, charFilterFactoryFactories);
+                TokenizerFactory.newFactory("whitespace", WhitespaceTokenizer::new),
+                tokenFilterFactoryFactories, charFilterFactoryFactories);
         }
 
         if (!analyzers.containsKey(DEFAULT_ANALYZER_NAME)) {
@@ -613,7 +612,6 @@ public final class AnalysisRegistry implements Closeable {
                                           String name,
                                           AnalyzerProvider<?> normalizerFactory,
                                           Map<String, NamedAnalyzer> normalizers,
-                                          String tokenizerName,
                                           TokenizerFactory tokenizerFactory,
                                           Map<String, TokenFilterFactory> tokenFilters,
                                           Map<String, CharFilterFactory> charFilters) {
@@ -622,7 +620,7 @@ public final class AnalysisRegistry implements Closeable {
         }
 
         if (normalizerFactory instanceof CustomNormalizerProvider) {
-            ((CustomNormalizerProvider) normalizerFactory).build(tokenizerName, tokenizerFactory, charFilters, tokenFilters);
+            ((CustomNormalizerProvider) normalizerFactory).build(tokenizerFactory, charFilters, tokenFilters);
         }
         if (normalizers.containsKey(name)) {
             throw new IllegalStateException("already registered analyzer with name: " + name);
@@ -30,15 +30,15 @@ import java.util.Map;
  * See {@link ReloadableCustomAnalyzer} for an example usage.
  */
 public final class AnalyzerComponents {
-    private final String tokenizerName;
+
     private final TokenizerFactory tokenizerFactory;
     private final CharFilterFactory[] charFilters;
     private final TokenFilterFactory[] tokenFilters;
     private final AnalysisMode analysisMode;
 
-    AnalyzerComponents(String tokenizerName, TokenizerFactory tokenizerFactory, CharFilterFactory[] charFilters,
+    AnalyzerComponents(TokenizerFactory tokenizerFactory, CharFilterFactory[] charFilters,
                        TokenFilterFactory[] tokenFilters) {
-        this.tokenizerName = tokenizerName;
+
         this.tokenizerFactory = tokenizerFactory;
         this.charFilters = charFilters;
         this.tokenFilters = tokenFilters;
@@ -85,14 +85,10 @@ public final class AnalyzerComponents {
             tokenFilterList.add(tokenFilter);
         }
 
-        return new AnalyzerComponents(tokenizerName, tokenizer, charFiltersList.toArray(new CharFilterFactory[charFiltersList.size()]),
+        return new AnalyzerComponents(tokenizer, charFiltersList.toArray(new CharFilterFactory[charFiltersList.size()]),
             tokenFilterList.toArray(new TokenFilterFactory[tokenFilterList.size()]));
     }
 
-    public String getTokenizerName() {
-        return tokenizerName;
-    }
-
     public TokenizerFactory getTokenizerFactory() {
         return tokenizerFactory;
     }
@@ -108,4 +104,4 @@ public final class AnalyzerComponents {
     public AnalysisMode analysisMode() {
         return this.analysisMode;
     }
-}
+}
@@ -32,14 +32,14 @@ public final class CustomAnalyzer extends Analyzer implements AnalyzerComponents
     private final int offsetGap;
     private final AnalysisMode analysisMode;
 
-    public CustomAnalyzer(String tokenizerName, TokenizerFactory tokenizerFactory, CharFilterFactory[] charFilters,
+    public CustomAnalyzer(TokenizerFactory tokenizerFactory, CharFilterFactory[] charFilters,
                           TokenFilterFactory[] tokenFilters) {
-        this(tokenizerName, tokenizerFactory, charFilters, tokenFilters, 0, -1);
+        this(tokenizerFactory, charFilters, tokenFilters, 0, -1);
     }
 
-    public CustomAnalyzer(String tokenizerName, TokenizerFactory tokenizerFactory, CharFilterFactory[] charFilters,
+    public CustomAnalyzer(TokenizerFactory tokenizerFactory, CharFilterFactory[] charFilters,
                           TokenFilterFactory[] tokenFilters, int positionIncrementGap, int offsetGap) {
-        this.components = new AnalyzerComponents(tokenizerName, tokenizerFactory, charFilters, tokenFilters);
+        this.components = new AnalyzerComponents(tokenizerFactory, charFilters, tokenFilters);
         this.positionIncrementGap = positionIncrementGap;
         this.offsetGap = offsetGap;
         // merge and transfer token filter analysis modes with analyzer
@@ -50,13 +50,6 @@ public final class CustomAnalyzer extends Analyzer implements AnalyzerComponents
         this.analysisMode = mode;
     }
 
-    /**
-     * The name of the tokenizer as configured by the user.
-     */
-    public String getTokenizerName() {
-        return this.components.getTokenizerName();
-    }
-
     public TokenizerFactory tokenizerFactory() {
         return this.components.getTokenizerFactory();
     }
@@ -64,7 +64,7 @@ public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider<Analyz
         if (components.analysisMode().equals(AnalysisMode.SEARCH_TIME)) {
             return new ReloadableCustomAnalyzer(components, positionIncrementGap, offsetGap);
         } else {
-            return new CustomAnalyzer(components.getTokenizerName(), components.getTokenizerFactory(), components.getCharFilters(),
+            return new CustomAnalyzer(components.getTokenizerFactory(), components.getCharFilters(),
                 components.getTokenFilters(), positionIncrementGap, offsetGap);
         }
     }
@@ -43,7 +43,7 @@ public final class CustomNormalizerProvider extends AbstractIndexAnalyzerProvide
         this.analyzerSettings = settings;
     }
 
-    public void build(final String tokenizerName, final TokenizerFactory tokenizerFactory, final Map<String, CharFilterFactory> charFilters,
+    public void build(final TokenizerFactory tokenizerFactory, final Map<String, CharFilterFactory> charFilters,
                       final Map<String, TokenFilterFactory> tokenFilters) {
         if (analyzerSettings.get("tokenizer") != null) {
             throw new IllegalArgumentException("Custom normalizer [" + name() + "] cannot configure a tokenizer");
@@ -79,7 +79,6 @@ public final class CustomNormalizerProvider extends AbstractIndexAnalyzerProvide
         }
 
         this.customAnalyzer = new CustomAnalyzer(
-            tokenizerName,
             tokenizerFactory,
             charFiltersList.toArray(new CharFilterFactory[charFiltersList.size()]),
             tokenFilterList.toArray(new TokenFilterFactory[tokenFilterList.size()])
@@ -32,7 +32,7 @@ import java.io.IOException;
  * Shared implementation for pre-configured analysis components.
  */
 public abstract class PreConfiguredAnalysisComponent<T> implements AnalysisModule.AnalysisProvider<T> {
-    private final String name;
+    protected final String name;
     protected final PreBuiltCacheFactory.PreBuiltCache<T> cache;
 
     protected PreConfiguredAnalysisComponent(String name, PreBuiltCacheFactory.CachingStrategy cache) {
@@ -70,6 +70,6 @@ public final class PreConfiguredTokenizer extends PreConfiguredAnalysisComponent
 
     @Override
     protected TokenizerFactory create(Version version) {
-        return () -> create.apply(version);
+        return TokenizerFactory.newFactory(name, () -> create.apply(version));
     }
 }
@@ -31,7 +31,7 @@ public class StandardTokenizerFactory extends AbstractTokenizerFactory {
     private final int maxTokenLength;
 
     public StandardTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, settings);
+        super(indexSettings, settings, name);
         maxTokenLength = settings.getAsInt("max_token_length", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
     }
 
@@ -21,6 +21,25 @@
 package org.elasticsearch.index.analysis;
 
 import org.apache.lucene.analysis.Tokenizer;
 
+import java.util.function.Supplier;
+
 public interface TokenizerFactory {
+
+    String name();
+
     Tokenizer create();
+
+    static TokenizerFactory newFactory(String name, Supplier<Tokenizer> supplier) {
+        return new TokenizerFactory() {
+            @Override
+            public String name() {
+                return name;
+            }
+
+            @Override
+            public Tokenizer create() {
+                return supplier.get();
+            }
+        };
+    }
 }
@@ -132,7 +132,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
         @Override
         public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
             return singletonMap("keyword", (indexSettings, environment, name, settings) ->
-                () -> new MockTokenizer(MockTokenizer.KEYWORD, false));
+                TokenizerFactory.newFactory(name, () -> new MockTokenizer(MockTokenizer.KEYWORD, false)));
         }
 
         @Override
@@ -120,7 +120,7 @@ public class AnalysisRegistryTests extends ESTestCase {
                 return null;
             }
         };
-        Analyzer analyzer = new CustomAnalyzer("tokenizerName", null, new CharFilterFactory[0], new TokenFilterFactory[] { tokenFilter });
+        Analyzer analyzer = new CustomAnalyzer(null, new CharFilterFactory[0], new TokenFilterFactory[] { tokenFilter });
         MapperException ex = expectThrows(MapperException.class,
             () -> emptyRegistry.build(IndexSettingsModule.newIndexSettings("index", settings),
                 singletonMap("default", new PreBuiltAnalyzerProvider("default", AnalyzerScope.INDEX, analyzer)), emptyMap(),
@@ -170,7 +170,7 @@ public class CustomNormalizerTests extends ESTokenStreamTestCase {
         @Override
         public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
             return singletonMap("keyword", (indexSettings, environment, name, settings) ->
-                () -> new MockTokenizer(MockTokenizer.KEYWORD, false));
+                TokenizerFactory.newFactory(name, () -> new MockTokenizer(MockTokenizer.KEYWORD, false)));
         }
     }
 }
@@ -73,7 +73,7 @@ public class NamedAnalyzerTests extends ESTestCase {
                 return mode;
             }
         };
-        return new CustomAnalyzer("tokenizerName", null, new CharFilterFactory[0],
+        return new CustomAnalyzer(null, new CharFilterFactory[0],
             new TokenFilterFactory[] { tokenFilter });
     }
 }
@@ -94,7 +94,7 @@ public class ReloadableCustomAnalyzerTests extends ESTestCase {
         try (ReloadableCustomAnalyzer analyzer = new ReloadableCustomAnalyzer(components, positionIncrementGap, offsetGap)) {
             assertEquals(positionIncrementGap, analyzer.getPositionIncrementGap(randomAlphaOfLength(5)));
             assertEquals(offsetGap >= 0 ? offsetGap : 1, analyzer.getOffsetGap(randomAlphaOfLength(5)));
-            assertEquals("standard", analyzer.getComponents().getTokenizerName());
+            assertEquals("standard", analyzer.getComponents().getTokenizerFactory().name());
             assertEquals(0, analyzer.getComponents().getCharFilters().length);
             assertSame(testAnalysis.tokenizer.get("standard"), analyzer.getComponents().getTokenizerFactory());
             assertEquals(1, analyzer.getComponents().getTokenFilters().length);
@@ -21,7 +21,6 @@ package org.elasticsearch.index.mapper;
 
 import org.apache.lucene.analysis.MockLowerCaseFilter;
 import org.apache.lucene.analysis.MockTokenizer;
-import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.index.DocValuesType;
 import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.IndexableField;
@@ -69,16 +68,8 @@ public class KeywordFieldMapperTests extends ESSingleNodeTestCase {
 
         @Override
        public Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> getTokenizers() {
-            return singletonMap("keyword", (indexSettings, environment, name, settings) -> {
-                class Factory implements TokenizerFactory {
-
-                    @Override
-                    public Tokenizer create() {
-                        return new MockTokenizer(MockTokenizer.KEYWORD, false);
-                    }
-                }
-                return new Factory();
-            });
+            return singletonMap("keyword", (indexSettings, environment, name, settings) ->
+                TokenizerFactory.newFactory(name, () -> new MockTokenizer(MockTokenizer.KEYWORD, false)));
         }
 
    }
@@ -206,7 +206,7 @@ public class TypeParsersTests extends ESTestCase {
                 return null;
             }
         };
-        return new CustomAnalyzer("tokenizerName", null, new CharFilterFactory[0],
+        return new CustomAnalyzer(null, new CharFilterFactory[0],
             new TokenFilterFactory[] { tokenFilter });
     }
 }
@@ -281,8 +281,8 @@ public class AnalysisModuleTests extends ESTestCase {
             @Override
             public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
                 // Need mock keyword tokenizer here, because alpha / beta versions are broken up by the dash.
-                return singletonMap("keyword", (indexSettings, environment, name, settings) ->
-                    () -> new MockTokenizer(MockTokenizer.KEYWORD, false));
+                return singletonMap("keyword", (indexSettings, environment, name, settings)
+                    -> TokenizerFactory.newFactory(name, () -> new MockTokenizer(MockTokenizer.KEYWORD, false)));
             }
         })).getAnalysisRegistry();
 
@@ -19,7 +19,6 @@
 package org.elasticsearch.test;
 
 import org.apache.lucene.analysis.MockTokenizer;
-import org.apache.lucene.analysis.Tokenizer;
 import org.elasticsearch.index.analysis.TokenizerFactory;
 import org.elasticsearch.indices.analysis.AnalysisModule;
 import org.elasticsearch.plugins.AnalysisPlugin;
@@ -40,15 +39,7 @@ public class MockKeywordPlugin extends Plugin implements AnalysisPlugin {
 
     @Override
     public Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> getTokenizers() {
-        return singletonMap("keyword", (indexSettings, environment, name, settings) -> {
-            class Factory implements TokenizerFactory {
-
-                @Override
-                public Tokenizer create() {
-                    return new MockTokenizer(MockTokenizer.KEYWORD, false);
-                }
-            }
-            return new Factory();
-        });
+        return singletonMap("keyword", (indexSettings, environment, name, settings) ->
+            TokenizerFactory.newFactory(name, () -> new MockTokenizer(MockTokenizer.KEYWORD, false)));
     }
 }
@@ -20,7 +20,7 @@ import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
 public class MlClassicTokenizerFactory extends AbstractTokenizerFactory {
 
     public MlClassicTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, settings);
+        super(indexSettings, settings, name);
     }
 
     @Override