Move construction of custom analyzers into AnalysisRegistry (#42940)
Both TransportAnalyzeAction and CategorizationAnalyzer contain logic for building custom analyzers for index-independent analysis. Much of this code is duplicated; it also requires the AnalysisRegistry to expose a number of internal provider classes, and it makes assumptions about when analysis components are constructed. This commit moves the build logic directly into AnalysisRegistry, considerably reducing the registry's API surface.
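For orientation, here is a minimal caller-side sketch of the consolidated entry point. The buildCustomAnalyzer signature and the NameOrDefinition constructors are taken from the diff below; the component names ("standard", "lowercase", the anonymous "stop" definition) and the use of a null IndexSettings are illustrative assumptions, not code from this commit.

import org.elasticsearch.index.analysis.AnalysisRegistry;
import org.elasticsearch.index.analysis.NameOrDefinition;
import org.elasticsearch.index.analysis.NamedAnalyzer;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

class CustomAnalyzerSketch {

    // Builds a one-shot analyzer from ad-hoc component specs and closes it after use,
    // mirroring what TransportAnalyzeAction now does through the registry.
    static void analyzeOnce(AnalysisRegistry registry) throws IOException {
        NameOrDefinition tokenizer = new NameOrDefinition("standard");      // named, pre-built component
        Map<String, Object> anonymousFilter = new HashMap<>();              // anonymous definition
        anonymousFilter.put("type", "stop");
        anonymousFilter.put("stopwords", Arrays.asList("a", "the"));
        List<NameOrDefinition> tokenFilters =
            Arrays.asList(new NameOrDefinition("lowercase"), new NameOrDefinition(anonymousFilter));
        List<NameOrDefinition> charFilters = Collections.emptyList();

        // A null IndexSettings restricts resolution to global/pre-built components;
        // anonymous definitions are built against the registry's internal placeholder settings.
        try (NamedAnalyzer analyzer =
                 registry.buildCustomAnalyzer(null, false, tokenizer, charFilters, tokenFilters)) {
            // run analysis, e.g. analyzer.tokenStream("field", "some text")
        }
    }
}

The same entry point also serves the normalizer case: the transport action now passes true together with a keyword tokenizer instead of assembling a CustomAnalyzer by hand.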
This commit is contained in:
parent 81a3b6e2fe · commit 8e23e4518a
@@ -72,5 +72,4 @@
- match: { detail.tokenizer.tokens.0.token: foo }
- match: { detail.tokenizer.tokens.1.token: bar }
- match: { detail.tokenizer.tokens.2.token: buzz }
- match: { detail.tokenfilters.0.name: "_anonymous_tokenfilter" }
- match: { detail.tokenfilters.0.tokens.0.token: bar }
@@ -28,15 +28,12 @@ import org.elasticsearch.common.Strings;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.io.stream.Writeable;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.ObjectParser;
import org.elasticsearch.common.xcontent.ToXContentFragment;
import org.elasticsearch.common.xcontent.ToXContentObject;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.common.xcontent.XContentParseException;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.analysis.NameOrDefinition;

import java.io.IOException;
import java.util.ArrayList;
@ -83,60 +80,6 @@ public class AnalyzeAction extends Action<AnalyzeAction.Response> {
|
|||
private String[] attributes = Strings.EMPTY_ARRAY;
|
||||
private String normalizer;
|
||||
|
||||
public static class NameOrDefinition implements Writeable {
|
||||
// exactly one of these two members is not null
|
||||
public final String name;
|
||||
public final Settings definition;
|
||||
|
||||
NameOrDefinition(String name) {
|
||||
this.name = Objects.requireNonNull(name);
|
||||
this.definition = null;
|
||||
}
|
||||
|
||||
NameOrDefinition(Map<String, ?> definition) {
|
||||
this.name = null;
|
||||
Objects.requireNonNull(definition);
|
||||
try {
|
||||
XContentBuilder builder = XContentFactory.contentBuilder(XContentType.JSON);
|
||||
builder.map(definition);
|
||||
this.definition = Settings.builder().loadFromSource(Strings.toString(builder), builder.contentType()).build();
|
||||
} catch (IOException e) {
|
||||
throw new IllegalArgumentException("Failed to parse [" + definition + "]", e);
|
||||
}
|
||||
}
|
||||
|
||||
NameOrDefinition(StreamInput in) throws IOException {
|
||||
name = in.readOptionalString();
|
||||
if (in.readBoolean()) {
|
||||
definition = Settings.readSettingsFromStream(in);
|
||||
} else {
|
||||
definition = null;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeTo(StreamOutput out) throws IOException {
|
||||
out.writeOptionalString(name);
|
||||
boolean isNotNullDefinition = this.definition != null;
|
||||
out.writeBoolean(isNotNullDefinition);
|
||||
if (isNotNullDefinition) {
|
||||
Settings.writeSettingsToStream(definition, out);
|
||||
}
|
||||
}
|
||||
|
||||
public static NameOrDefinition fromXContent(XContentParser parser) throws IOException {
|
||||
if (parser.currentToken() == XContentParser.Token.VALUE_STRING) {
|
||||
return new NameOrDefinition(parser.text());
|
||||
}
|
||||
if (parser.currentToken() == XContentParser.Token.START_OBJECT) {
|
||||
return new NameOrDefinition(parser.map());
|
||||
}
|
||||
throw new XContentParseException(parser.getTokenLocation(),
|
||||
"Expected [VALUE_STRING] or [START_OBJECT], got " + parser.currentToken());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public Request() {
|
||||
}
|
||||
|
||||
|
|
|
@ -28,45 +28,37 @@ import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
|
|||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.elasticsearch.ElasticsearchException;
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.action.support.ActionFilters;
|
||||
import org.elasticsearch.action.support.single.shard.TransportSingleShardAction;
|
||||
import org.elasticsearch.cluster.ClusterState;
|
||||
import org.elasticsearch.cluster.block.ClusterBlockException;
|
||||
import org.elasticsearch.cluster.metadata.IndexMetaData;
|
||||
import org.elasticsearch.cluster.metadata.IndexNameExpressionResolver;
|
||||
import org.elasticsearch.cluster.routing.ShardsIterator;
|
||||
import org.elasticsearch.cluster.service.ClusterService;
|
||||
import org.elasticsearch.common.UUIDs;
|
||||
import org.elasticsearch.common.collect.Tuple;
|
||||
import org.elasticsearch.common.inject.Inject;
|
||||
import org.elasticsearch.common.io.stream.Writeable;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.core.internal.io.IOUtils;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexService;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.AnalysisRegistry;
|
||||
import org.elasticsearch.index.analysis.CharFilterFactory;
|
||||
import org.elasticsearch.index.analysis.CustomAnalyzer;
|
||||
import org.elasticsearch.index.analysis.IndexAnalyzers;
|
||||
import org.elasticsearch.index.analysis.NameOrDefinition;
|
||||
import org.elasticsearch.index.analysis.NamedAnalyzer;
|
||||
import org.elasticsearch.index.analysis.NormalizingCharFilterFactory;
|
||||
import org.elasticsearch.index.analysis.NormalizingTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.TokenizerFactory;
|
||||
import org.elasticsearch.index.mapper.KeywordFieldMapper;
|
||||
import org.elasticsearch.index.mapper.MappedFieldType;
|
||||
import org.elasticsearch.index.shard.ShardId;
|
||||
import org.elasticsearch.indices.IndicesService;
|
||||
import org.elasticsearch.indices.analysis.AnalysisModule;
|
||||
import org.elasticsearch.threadpool.ThreadPool;
|
||||
import org.elasticsearch.transport.TransportService;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.io.UncheckedIOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
|
@ -74,7 +66,6 @@ import java.util.Locale;
|
|||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
import java.util.function.Function;
|
||||
|
||||
/**
|
||||
* Transport action used to execute analyze requests
|
||||
|
@@ -83,17 +74,15 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeAc

    private final Settings settings;
    private final IndicesService indicesService;
    private final Environment environment;

    @Inject
    public TransportAnalyzeAction(Settings settings, ThreadPool threadPool, ClusterService clusterService,
                                  TransportService transportService, IndicesService indicesService, ActionFilters actionFilters,
                                  IndexNameExpressionResolver indexNameExpressionResolver, Environment environment) {
                                  IndexNameExpressionResolver indexNameExpressionResolver) {
        super(AnalyzeAction.NAME, threadPool, clusterService, transportService, actionFilters, indexNameExpressionResolver,
            AnalyzeAction.Request::new, ThreadPool.Names.ANALYZE);
        this.settings = settings;
        this.indicesService = indicesService;
        this.environment = environment;
    }

    @Override
@@ -129,17 +118,17 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeAc
        final int maxTokenCount = indexService == null ?
            IndexSettings.MAX_TOKEN_COUNT_SETTING.get(settings) : indexService.getIndexSettings().getMaxTokenCount();

        return analyze(request, indicesService.getAnalysis(), environment, indexService, maxTokenCount);
        return analyze(request, indicesService.getAnalysis(), indexService, maxTokenCount);
    }

    public static AnalyzeAction.Response analyze(AnalyzeAction.Request request, AnalysisRegistry analysisRegistry,
                                                 Environment environment, IndexService indexService, int maxTokenCount) throws IOException {
                                                 IndexService indexService, int maxTokenCount) throws IOException {

        IndexAnalyzers indexAnalyzers = indexService == null ? null : indexService.getIndexAnalyzers();

        // First, we check to see if the request requires a custom analyzer. If so, then we
        // need to build it and then close it after use.
        try (Analyzer analyzer = buildCustomAnalyzer(request, analysisRegistry, indexAnalyzers, environment)) {
        try (Analyzer analyzer = buildCustomAnalyzer(request, analysisRegistry, indexAnalyzers)) {
            if (analyzer != null) {
                return analyze(request, analyzer, maxTokenCount);
            }
@@ -205,38 +194,15 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeAc
    }

    private static Analyzer buildCustomAnalyzer(AnalyzeAction.Request request, AnalysisRegistry analysisRegistry,
                                                IndexAnalyzers indexAnalyzers, Environment environment) throws IOException {
                                                IndexAnalyzers indexAnalyzers) throws IOException {
        final IndexSettings indexSettings = indexAnalyzers == null ? null : indexAnalyzers.getIndexSettings();
        if (request.tokenizer() != null) {
            final IndexSettings indexSettings = indexAnalyzers == null ? null : indexAnalyzers.getIndexSettings();
            Tuple<String, TokenizerFactory> tokenizerFactory = parseTokenizerFactory(request, indexAnalyzers,
                analysisRegistry, environment);

            List<CharFilterFactory> charFilterFactoryList =
                parseCharFilterFactories(request, indexSettings, analysisRegistry, environment, false);

            List<TokenFilterFactory> tokenFilterFactoryList = parseTokenFilterFactories(request, indexSettings, analysisRegistry,
                environment, tokenizerFactory, charFilterFactoryList, false);

            return new CustomAnalyzer(tokenizerFactory.v1(), tokenizerFactory.v2(),
                charFilterFactoryList.toArray(new CharFilterFactory[0]),
                tokenFilterFactoryList.toArray(new TokenFilterFactory[0]));
            return analysisRegistry.buildCustomAnalyzer(indexSettings, false,
                request.tokenizer(), request.charFilters(), request.tokenFilters());
        } else if (((request.tokenFilters() != null && request.tokenFilters().size() > 0)
                || (request.charFilters() != null && request.charFilters().size() > 0))) {
            final IndexSettings indexSettings = indexAnalyzers == null ? null : indexAnalyzers.getIndexSettings();
            // custom normalizer = if normalizer == null but filter or char_filter is not null and tokenizer/analyzer is null
            // get charfilter and filter from request
            List<CharFilterFactory> charFilterFactoryList =
                parseCharFilterFactories(request, indexSettings, analysisRegistry, environment, true);

            final String keywordTokenizerName = "keyword";
            TokenizerFactory keywordTokenizerFactory = getTokenizerFactory(analysisRegistry, environment, keywordTokenizerName);

            List<TokenFilterFactory> tokenFilterFactoryList =
                parseTokenFilterFactories(request, indexSettings, analysisRegistry, environment,
                    new Tuple<>(keywordTokenizerName, keywordTokenizerFactory), charFilterFactoryList, true);

            return new CustomAnalyzer("keyword_for_normalizer", keywordTokenizerFactory,
                charFilterFactoryList.toArray(new CharFilterFactory[0]), tokenFilterFactoryList.toArray(new TokenFilterFactory[0]));
            return analysisRegistry.buildCustomAnalyzer(indexSettings, true, new NameOrDefinition("keyword"),
                request.charFilters(), request.tokenFilters());
        }
        return null;
    }
@ -525,228 +491,4 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeAc
|
|||
return extendedAttributes;
|
||||
}
|
||||
|
||||
private static List<CharFilterFactory> parseCharFilterFactories(AnalyzeAction.Request request, IndexSettings indexSettings,
|
||||
AnalysisRegistry analysisRegistry, Environment environment,
|
||||
boolean normalizer) throws IOException {
|
||||
List<CharFilterFactory> charFilterFactoryList = new ArrayList<>();
|
||||
if (request.charFilters() != null && request.charFilters().size() > 0) {
|
||||
List<AnalyzeAction.Request.NameOrDefinition> charFilters = request.charFilters();
|
||||
for (AnalyzeAction.Request.NameOrDefinition charFilter : charFilters) {
|
||||
CharFilterFactory charFilterFactory;
|
||||
// parse anonymous settings
|
||||
if (charFilter.definition != null) {
|
||||
Settings settings = getAnonymousSettings(charFilter.definition);
|
||||
String charFilterTypeName = settings.get("type");
|
||||
if (charFilterTypeName == null) {
|
||||
throw new IllegalArgumentException("Missing [type] setting for anonymous char filter: " + charFilter.definition);
|
||||
}
|
||||
AnalysisModule.AnalysisProvider<CharFilterFactory> charFilterFactoryFactory =
|
||||
analysisRegistry.getCharFilterProvider(charFilterTypeName);
|
||||
if (charFilterFactoryFactory == null) {
|
||||
throw new IllegalArgumentException("failed to find global char filter under [" + charFilterTypeName + "]");
|
||||
}
|
||||
// Need to set anonymous "name" of char_filter
|
||||
charFilterFactory = charFilterFactoryFactory.get(getNaIndexSettings(settings), environment, "_anonymous_charfilter",
|
||||
settings);
|
||||
} else {
|
||||
AnalysisModule.AnalysisProvider<CharFilterFactory> charFilterFactoryFactory;
|
||||
if (indexSettings == null) {
|
||||
charFilterFactoryFactory = analysisRegistry.getCharFilterProvider(charFilter.name);
|
||||
if (charFilterFactoryFactory == null) {
|
||||
throw new IllegalArgumentException("failed to find global char filter under [" + charFilter.name + "]");
|
||||
}
|
||||
charFilterFactory = charFilterFactoryFactory.get(environment, charFilter.name);
|
||||
} else {
|
||||
charFilterFactoryFactory = analysisRegistry.getCharFilterProvider(charFilter.name, indexSettings);
|
||||
if (charFilterFactoryFactory == null) {
|
||||
throw new IllegalArgumentException("failed to find char filter under [" + charFilter.name + "]");
|
||||
}
|
||||
charFilterFactory = charFilterFactoryFactory.get(indexSettings, environment, charFilter.name,
|
||||
AnalysisRegistry.getSettingsFromIndexSettings(indexSettings,
|
||||
AnalysisRegistry.INDEX_ANALYSIS_CHAR_FILTER + "." + charFilter.name));
|
||||
}
|
||||
}
|
||||
if (charFilterFactory == null) {
|
||||
throw new IllegalArgumentException("failed to find char filter under [" + charFilter.name + "]");
|
||||
}
|
||||
if (normalizer) {
|
||||
if (charFilterFactory instanceof NormalizingCharFilterFactory == false) {
|
||||
throw new IllegalArgumentException("Custom normalizer may not use char filter ["
|
||||
+ charFilterFactory.name() + "]");
|
||||
}
|
||||
}
|
||||
charFilterFactoryList.add(charFilterFactory);
|
||||
}
|
||||
}
|
||||
return charFilterFactoryList;
|
||||
}
|
||||
|
||||
public static class DeferredTokenFilterRegistry implements Function<String, TokenFilterFactory> {
|
||||
|
||||
private final AnalysisRegistry analysisRegistry;
|
||||
private final IndexSettings indexSettings;
|
||||
Map<String, TokenFilterFactory> prebuiltFilters;
|
||||
|
||||
public DeferredTokenFilterRegistry(AnalysisRegistry analysisRegistry, IndexSettings indexSettings) {
|
||||
this.analysisRegistry = analysisRegistry;
|
||||
if (indexSettings == null) {
|
||||
// Settings are null when _analyze is called with no index name, so
|
||||
// we create dummy settings which will make prebuilt analysis components
|
||||
// available
|
||||
Settings settings = Settings.builder()
|
||||
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
|
||||
.put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
|
||||
.put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
|
||||
.put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID())
|
||||
.build();
|
||||
IndexMetaData metaData = IndexMetaData.builder(IndexMetaData.INDEX_UUID_NA_VALUE).settings(settings).build();
|
||||
indexSettings = new IndexSettings(metaData, Settings.EMPTY);
|
||||
}
|
||||
this.indexSettings = indexSettings;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenFilterFactory apply(String s) {
|
||||
if (prebuiltFilters == null) {
|
||||
try {
|
||||
prebuiltFilters = analysisRegistry.buildTokenFilterFactories(indexSettings);
|
||||
} catch (IOException e) {
|
||||
throw new UncheckedIOException(e);
|
||||
}
|
||||
}
|
||||
return prebuiltFilters.get(s);
|
||||
}
|
||||
}
|
||||
|
||||
private static List<TokenFilterFactory> parseTokenFilterFactories(AnalyzeAction.Request request, IndexSettings indexSettings,
|
||||
AnalysisRegistry analysisRegistry, Environment environment,
|
||||
Tuple<String, TokenizerFactory> tokenizerFactory,
|
||||
List<CharFilterFactory> charFilterFactoryList,
|
||||
boolean normalizer) throws IOException {
|
||||
List<TokenFilterFactory> tokenFilterFactoryList = new ArrayList<>();
|
||||
DeferredTokenFilterRegistry deferredRegistry = new DeferredTokenFilterRegistry(analysisRegistry, indexSettings);
|
||||
if (request.tokenFilters() != null && request.tokenFilters().size() > 0) {
|
||||
List<AnalyzeAction.Request.NameOrDefinition> tokenFilters = request.tokenFilters();
|
||||
for (AnalyzeAction.Request.NameOrDefinition tokenFilter : tokenFilters) {
|
||||
TokenFilterFactory tokenFilterFactory;
|
||||
// parse anonymous settings
|
||||
if (tokenFilter.definition != null) {
|
||||
Settings settings = getAnonymousSettings(tokenFilter.definition);
|
||||
String filterTypeName = settings.get("type");
|
||||
if (filterTypeName == null) {
|
||||
throw new IllegalArgumentException("Missing [type] setting for anonymous token filter: " + tokenFilter.definition);
|
||||
}
|
||||
AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory =
|
||||
analysisRegistry.getTokenFilterProvider(filterTypeName);
|
||||
if (tokenFilterFactoryFactory == null) {
|
||||
throw new IllegalArgumentException("failed to find global token filter under [" + filterTypeName + "]");
|
||||
}
|
||||
// Need to set anonymous "name" of tokenfilter
|
||||
tokenFilterFactory = tokenFilterFactoryFactory.get(getNaIndexSettings(settings), environment, "_anonymous_tokenfilter",
|
||||
settings);
|
||||
tokenFilterFactory = tokenFilterFactory.getChainAwareTokenFilterFactory(tokenizerFactory.v2(), charFilterFactoryList,
|
||||
tokenFilterFactoryList, deferredRegistry);
|
||||
|
||||
} else {
|
||||
AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory;
|
||||
if (indexSettings == null) {
|
||||
tokenFilterFactoryFactory = analysisRegistry.getTokenFilterProvider(tokenFilter.name);
|
||||
if (tokenFilterFactoryFactory == null) {
|
||||
throw new IllegalArgumentException("failed to find global token filter under [" + tokenFilter.name + "]");
|
||||
}
|
||||
tokenFilterFactory = tokenFilterFactoryFactory.get(environment, tokenFilter.name);
|
||||
} else {
|
||||
tokenFilterFactoryFactory = analysisRegistry.getTokenFilterProvider(tokenFilter.name, indexSettings);
|
||||
if (tokenFilterFactoryFactory == null) {
|
||||
throw new IllegalArgumentException("failed to find token filter under [" + tokenFilter.name + "]");
|
||||
}
|
||||
Settings settings = AnalysisRegistry.getSettingsFromIndexSettings(indexSettings,
|
||||
AnalysisRegistry.INDEX_ANALYSIS_FILTER + "." + tokenFilter.name);
|
||||
tokenFilterFactory = tokenFilterFactoryFactory.get(indexSettings, environment, tokenFilter.name, settings);
|
||||
tokenFilterFactory = tokenFilterFactory.getChainAwareTokenFilterFactory(tokenizerFactory.v2(),
|
||||
charFilterFactoryList, tokenFilterFactoryList, deferredRegistry);
|
||||
}
|
||||
}
|
||||
if (tokenFilterFactory == null) {
|
||||
throw new IllegalArgumentException("failed to find or create token filter under [" + tokenFilter.name + "]");
|
||||
}
|
||||
if (normalizer) {
|
||||
if (tokenFilterFactory instanceof NormalizingTokenFilterFactory == false) {
|
||||
throw new IllegalArgumentException("Custom normalizer may not use filter ["
|
||||
+ tokenFilterFactory.name() + "]");
|
||||
}
|
||||
}
|
||||
tokenFilterFactoryList.add(tokenFilterFactory);
|
||||
}
|
||||
}
|
||||
return tokenFilterFactoryList;
|
||||
}
|
||||
|
||||
private static Tuple<String, TokenizerFactory> parseTokenizerFactory(AnalyzeAction.Request request, IndexAnalyzers indexAnalzyers,
|
||||
AnalysisRegistry analysisRegistry,
|
||||
Environment environment) throws IOException {
|
||||
String name;
|
||||
TokenizerFactory tokenizerFactory;
|
||||
final AnalyzeAction.Request.NameOrDefinition tokenizer = request.tokenizer();
|
||||
// parse anonymous settings
|
||||
if (tokenizer.definition != null) {
|
||||
Settings settings = getAnonymousSettings(tokenizer.definition);
|
||||
String tokenizerTypeName = settings.get("type");
|
||||
if (tokenizerTypeName == null) {
|
||||
throw new IllegalArgumentException("Missing [type] setting for anonymous tokenizer: " + tokenizer.definition);
|
||||
}
|
||||
AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory =
|
||||
analysisRegistry.getTokenizerProvider(tokenizerTypeName);
|
||||
if (tokenizerFactoryFactory == null) {
|
||||
throw new IllegalArgumentException("failed to find global tokenizer under [" + tokenizerTypeName + "]");
|
||||
}
|
||||
// Need to set anonymous "name" of tokenizer
|
||||
name = "_anonymous_tokenizer";
|
||||
tokenizerFactory = tokenizerFactoryFactory.get(getNaIndexSettings(settings), environment, "_anonymous_tokenizer", settings);
|
||||
} else {
|
||||
AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory;
|
||||
if (indexAnalzyers == null) {
|
||||
tokenizerFactory = getTokenizerFactory(analysisRegistry, environment, tokenizer.name);
|
||||
name = tokenizer.name;
|
||||
} else {
|
||||
tokenizerFactoryFactory = analysisRegistry.getTokenizerProvider(tokenizer.name, indexAnalzyers.getIndexSettings());
|
||||
if (tokenizerFactoryFactory == null) {
|
||||
throw new IllegalArgumentException("failed to find tokenizer under [" + tokenizer.name + "]");
|
||||
}
|
||||
name = tokenizer.name;
|
||||
tokenizerFactory = tokenizerFactoryFactory.get(indexAnalzyers.getIndexSettings(), environment, tokenizer.name,
|
||||
AnalysisRegistry.getSettingsFromIndexSettings(indexAnalzyers.getIndexSettings(),
|
||||
AnalysisRegistry.INDEX_ANALYSIS_TOKENIZER + "." + tokenizer.name));
|
||||
}
|
||||
}
|
||||
return new Tuple<>(name, tokenizerFactory);
|
||||
}
|
||||
|
||||
private static TokenizerFactory getTokenizerFactory(AnalysisRegistry analysisRegistry, Environment environment,
|
||||
String name) throws IOException {
|
||||
AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory;
|
||||
TokenizerFactory tokenizerFactory;
|
||||
tokenizerFactoryFactory = analysisRegistry.getTokenizerProvider(name);
|
||||
if (tokenizerFactoryFactory == null) {
|
||||
throw new IllegalArgumentException("failed to find global tokenizer under [" + name + "]");
|
||||
}
|
||||
tokenizerFactory = tokenizerFactoryFactory.get(environment, name);
|
||||
return tokenizerFactory;
|
||||
}
|
||||
|
||||
private static IndexSettings getNaIndexSettings(Settings settings) {
|
||||
IndexMetaData metaData = IndexMetaData.builder(IndexMetaData.INDEX_UUID_NA_VALUE).settings(settings).build();
|
||||
return new IndexSettings(metaData, Settings.EMPTY);
|
||||
}
|
||||
|
||||
private static Settings getAnonymousSettings(Settings providerSetting) {
|
||||
return Settings.builder().put(providerSetting)
|
||||
// for _na_
|
||||
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
|
||||
.put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
|
||||
.put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
|
||||
.put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID())
|
||||
.build();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -34,11 +34,15 @@ import org.elasticsearch.indices.analysis.PreBuiltAnalyzers;
|
|||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.io.UncheckedIOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.function.BiFunction;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
|
@ -87,13 +91,7 @@ public final class AnalysisRegistry implements Closeable {
|
|||
new PrebuiltAnalysis(preConfiguredCharFilters, preConfiguredTokenFilters, preConfiguredTokenizers, preConfiguredAnalyzers);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a {@link Settings} by groupName from {@link IndexSettings} or a default {@link Settings}
|
||||
* @param indexSettings an index settings
|
||||
* @param groupName tokenizer/token filter/char filter name
|
||||
* @return {@link Settings}
|
||||
*/
|
||||
public static Settings getSettingsFromIndexSettings(IndexSettings indexSettings, String groupName) {
|
||||
private static Settings getSettingsFromIndexSettings(IndexSettings indexSettings, String groupName) {
|
||||
Settings settings = indexSettings.getSettings().getAsSettings(groupName);
|
||||
if (settings.isEmpty()) {
|
||||
settings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, indexSettings.getIndexVersionCreated()).build();
|
||||
|
@@ -101,24 +99,70 @@ public final class AnalysisRegistry implements Closeable {
        return settings;
    }

    private static final IndexSettings NO_INDEX_SETTINGS = new IndexSettings(
        IndexMetaData.builder(IndexMetaData.INDEX_UUID_NA_VALUE)
            .settings(Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT))
            .numberOfReplicas(0)
            .numberOfShards(1)
            .build(),
        Settings.EMPTY
    );

    private <T> T getComponentFactory(IndexSettings settings, NameOrDefinition nod,
                                      String componentType,
                                      Function<String, AnalysisProvider<T>> globalComponentProvider,
                                      BiFunction<String, IndexSettings, AnalysisProvider<T>> indexComponentProvider) throws IOException {
        if (nod.definition != null) {
            // custom component, so we build it from scratch
            String type = nod.definition.get("type");
            if (type == null) {
                throw new IllegalArgumentException("Missing [type] setting for anonymous " + componentType + ": " + nod.definition);
            }
            AnalysisProvider<T> factory = globalComponentProvider.apply(type);
            if (factory == null) {
                throw new IllegalArgumentException("failed to find global " + componentType + " under [" + type + "]");
            }
            if (settings == null) {
                settings = NO_INDEX_SETTINGS;
            }
            return factory.get(settings, environment, "__anonymous__" + type, nod.definition);
        }
        if (settings == null) {
            // no index provided, so we use global analysis components only
            AnalysisProvider<T> factory = globalComponentProvider.apply(nod.name);
            if (factory == null) {
                throw new IllegalArgumentException("failed to find global " + componentType + " under [" + nod.name + "]");
            }
            return factory.get(environment, nod.name);
        } else {
            // get the component from index settings
            AnalysisProvider<T> factory = indexComponentProvider.apply(nod.name, settings);
            if (factory == null) {
                throw new IllegalArgumentException("failed to find " + componentType + " under [" + nod.name + "]");
            }
            Settings s = getSettingsFromIndexSettings(settings, "index.analysis." + componentType + "." + nod.name);
            return factory.get(settings, environment, nod.name, s);
        }
    }

    /**
     * Returns a registered {@link TokenizerFactory} provider by name or <code>null</code> if the tokenizer was not registered
     */
    public AnalysisModule.AnalysisProvider<TokenizerFactory> getTokenizerProvider(String tokenizer) {
    private AnalysisModule.AnalysisProvider<TokenizerFactory> getTokenizerProvider(String tokenizer) {
        return tokenizers.getOrDefault(tokenizer, this.prebuiltAnalysis.getTokenizerFactory(tokenizer));
    }

    /**
     * Returns a registered {@link TokenFilterFactory} provider by name or <code>null</code> if the token filter was not registered
     */
    public AnalysisModule.AnalysisProvider<TokenFilterFactory> getTokenFilterProvider(String tokenFilter) {
    private AnalysisModule.AnalysisProvider<TokenFilterFactory> getTokenFilterProvider(String tokenFilter) {
        return tokenFilters.getOrDefault(tokenFilter, this.prebuiltAnalysis.getTokenFilterFactory(tokenFilter));
    }

    /**
     * Returns a registered {@link CharFilterFactory} provider by name or <code>null</code> if the char filter was not registered
     */
    public AnalysisModule.AnalysisProvider<CharFilterFactory> getCharFilterProvider(String charFilter) {
    private AnalysisModule.AnalysisProvider<CharFilterFactory> getCharFilterProvider(String charFilter) {
        return charFilters.getOrDefault(charFilter, this.prebuiltAnalysis.getCharFilterFactory(charFilter));
    }
@@ -167,6 +211,66 @@ public final class AnalysisRegistry implements Closeable {
        return build(indexSettings, analyzerFactories, normalizerFactories, tokenizerFactories, charFilterFactories, tokenFilterFactories);
    }

    /**
     * Creates a custom analyzer from a collection of {@link NameOrDefinition} specifications for each component
     *
     * Callers are responsible for closing the returned Analyzer
     */
    public NamedAnalyzer buildCustomAnalyzer(IndexSettings indexSettings, boolean normalizer, NameOrDefinition tokenizer,
                                             List<NameOrDefinition> charFilters, List<NameOrDefinition> tokenFilters) throws IOException {
        TokenizerFactory tokenizerFactory
            = getComponentFactory(indexSettings, tokenizer, "tokenizer", this::getTokenizerProvider, this::getTokenizerProvider);

        List<CharFilterFactory> charFilterFactories = new ArrayList<>();
        for (NameOrDefinition nod : charFilters) {
            charFilterFactories.add(getComponentFactory(indexSettings, nod, "char_filter",
                this::getCharFilterProvider, this::getCharFilterProvider));
        }

        List<TokenFilterFactory> tokenFilterFactories = new ArrayList<>();
        for (NameOrDefinition nod : tokenFilters) {
            TokenFilterFactory tff = getComponentFactory(indexSettings, nod, "filter",
                this::getTokenFilterProvider, this::getTokenFilterProvider);
            if (normalizer && tff instanceof NormalizingTokenFilterFactory == false) {
                throw new IllegalArgumentException("Custom normalizer may not use filter [" + tff.name() + "]");
            }
            tff = tff.getChainAwareTokenFilterFactory(tokenizerFactory, charFilterFactories, tokenFilterFactories, name -> {
                try {
                    return getComponentFactory(indexSettings, new NameOrDefinition(name), "filter",
                        this::getTokenFilterProvider, this::getTokenFilterProvider);
                } catch (IOException e) {
                    throw new UncheckedIOException(e);
                }
            });
            tokenFilterFactories.add(tff);
        }

        String tokenizerName = tokenizer.name == null ? "_anonymous_tokenizer" : tokenizer.name;
        if (normalizer) {
            tokenizerName = "keyword_for_normalizer";
        }
        Analyzer analyzer = new CustomAnalyzer(tokenizerName, tokenizerFactory,
            charFilterFactories.toArray(new CharFilterFactory[]{}),
            tokenFilterFactories.toArray(new TokenFilterFactory[]{}));
        return produceAnalyzer("__custom__", new AnalyzerProvider<Analyzer>() {
            @Override
            public String name() {
                return "__custom__";
            }

            @Override
            public AnalyzerScope scope() {
                return AnalyzerScope.GLOBAL;
            }

            @Override
            public Analyzer get() {
                return analyzer;
            }
        }, null, null, null);

    }

    public Map<String, TokenFilterFactory> buildTokenFilterFactories(IndexSettings indexSettings) throws IOException {
        final Map<String, Settings> tokenFiltersSettings = indexSettings.getSettings().getGroups(INDEX_ANALYSIS_FILTER);
        return buildMapping(Component.FILTER, indexSettings, tokenFiltersSettings, this.tokenFilters,
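To make the normalizer flag above concrete, a small sketch of how a caller such as TransportAnalyzeAction drives it: normalizer is set to true, a keyword tokenizer is supplied, and the registry rejects any token filter that is not a NormalizingTokenFilterFactory. The surrounding helper method, the lowercase filter choice, and where the IndexSettings comes from are illustrative assumptions; the buildCustomAnalyzer call mirrors the transport-action change earlier in this diff.

import org.apache.lucene.analysis.Analyzer;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AnalysisRegistry;
import org.elasticsearch.index.analysis.NameOrDefinition;

import java.io.IOException;
import java.util.Collections;
import java.util.List;

class NormalizerSketch {

    // Builds a normalizer-style analyzer: keyword tokenizer plus normalizing filters only.
    // A non-normalizing filter makes the registry throw an IllegalArgumentException.
    static void normalizeOnce(AnalysisRegistry registry, IndexSettings indexSettings) throws IOException {
        List<NameOrDefinition> charFilters = Collections.emptyList();
        List<NameOrDefinition> tokenFilters =
            Collections.singletonList(new NameOrDefinition("lowercase"));   // illustrative filter

        try (Analyzer normalizer = registry.buildCustomAnalyzer(
                 indexSettings, true, new NameOrDefinition("keyword"), charFilters, tokenFilters)) {
            // normalize values with the analyzer; closing it releases the underlying components
        }
    }
}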
@ -184,12 +288,12 @@ public final class AnalysisRegistry implements Closeable {
|
|||
prebuiltAnalysis.preConfiguredCharFilterFactories);
|
||||
}
|
||||
|
||||
public Map<String, AnalyzerProvider<?>> buildAnalyzerFactories(IndexSettings indexSettings) throws IOException {
|
||||
private Map<String, AnalyzerProvider<?>> buildAnalyzerFactories(IndexSettings indexSettings) throws IOException {
|
||||
final Map<String, Settings> analyzersSettings = indexSettings.getSettings().getGroups("index.analysis.analyzer");
|
||||
return buildMapping(Component.ANALYZER, indexSettings, analyzersSettings, analyzers, prebuiltAnalysis.analyzerProviderFactories);
|
||||
}
|
||||
|
||||
public Map<String, AnalyzerProvider<?>> buildNormalizerFactories(IndexSettings indexSettings) throws IOException {
|
||||
private Map<String, AnalyzerProvider<?>> buildNormalizerFactories(IndexSettings indexSettings) throws IOException {
|
||||
final Map<String, Settings> normalizersSettings = indexSettings.getSettings().getGroups("index.analysis.normalizer");
|
||||
// TODO: Have pre-built normalizers
|
||||
return buildMapping(Component.NORMALIZER, indexSettings, normalizersSettings, normalizers, Collections.emptyMap());
|
||||
|
@ -203,7 +307,7 @@ public final class AnalysisRegistry implements Closeable {
|
|||
* @param indexSettings an index settings
|
||||
* @return {@link TokenizerFactory} provider or <code>null</code>
|
||||
*/
|
||||
public AnalysisProvider<TokenizerFactory> getTokenizerProvider(String tokenizer, IndexSettings indexSettings) {
|
||||
private AnalysisProvider<TokenizerFactory> getTokenizerProvider(String tokenizer, IndexSettings indexSettings) {
|
||||
return getProvider(Component.TOKENIZER, tokenizer, indexSettings, "index.analysis.tokenizer", tokenizers,
|
||||
this::getTokenizerProvider);
|
||||
}
|
||||
|
@ -216,7 +320,7 @@ public final class AnalysisRegistry implements Closeable {
|
|||
* @param indexSettings an index settings
|
||||
* @return {@link TokenFilterFactory} provider or <code>null</code>
|
||||
*/
|
||||
public AnalysisProvider<TokenFilterFactory> getTokenFilterProvider(String tokenFilter, IndexSettings indexSettings) {
|
||||
private AnalysisProvider<TokenFilterFactory> getTokenFilterProvider(String tokenFilter, IndexSettings indexSettings) {
|
||||
return getProvider(Component.FILTER, tokenFilter, indexSettings, "index.analysis.filter", tokenFilters,
|
||||
this::getTokenFilterProvider);
|
||||
}
|
||||
|
@ -229,7 +333,7 @@ public final class AnalysisRegistry implements Closeable {
|
|||
* @param indexSettings an index settings
|
||||
* @return {@link CharFilterFactory} provider or <code>null</code>
|
||||
*/
|
||||
public AnalysisProvider<CharFilterFactory> getCharFilterProvider(String charFilter, IndexSettings indexSettings) {
|
||||
private AnalysisProvider<CharFilterFactory> getCharFilterProvider(String charFilter, IndexSettings indexSettings) {
|
||||
return getProvider(Component.CHAR_FILTER, charFilter, indexSettings, "index.analysis.char_filter", charFilters,
|
||||
this::getCharFilterProvider);
|
||||
}
|
||||
|
@ -388,19 +492,19 @@ public final class AnalysisRegistry implements Closeable {
|
|||
this.preConfiguredTokenizers = preConfiguredTokenizers;
|
||||
}
|
||||
|
||||
public AnalysisProvider<CharFilterFactory> getCharFilterFactory(String name) {
|
||||
AnalysisProvider<CharFilterFactory> getCharFilterFactory(String name) {
|
||||
return preConfiguredCharFilterFactories.get(name);
|
||||
}
|
||||
|
||||
public AnalysisProvider<TokenFilterFactory> getTokenFilterFactory(String name) {
|
||||
AnalysisProvider<TokenFilterFactory> getTokenFilterFactory(String name) {
|
||||
return preConfiguredTokenFilters.get(name);
|
||||
}
|
||||
|
||||
public AnalysisProvider<TokenizerFactory> getTokenizerFactory(String name) {
|
||||
AnalysisProvider<TokenizerFactory> getTokenizerFactory(String name) {
|
||||
return preConfiguredTokenizers.get(name);
|
||||
}
|
||||
|
||||
public AnalysisProvider<AnalyzerProvider<?>> getAnalyzerProvider(String name) {
|
||||
AnalysisProvider<AnalyzerProvider<?>> getAnalyzerProvider(String name) {
|
||||
return analyzerProviderFactories.get(name);
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,115 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
|
||||
import org.elasticsearch.common.Strings;
|
||||
import org.elasticsearch.common.io.stream.StreamInput;
|
||||
import org.elasticsearch.common.io.stream.StreamOutput;
|
||||
import org.elasticsearch.common.io.stream.Writeable;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.common.xcontent.ToXContentFragment;
|
||||
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||
import org.elasticsearch.common.xcontent.XContentFactory;
|
||||
import org.elasticsearch.common.xcontent.XContentParseException;
|
||||
import org.elasticsearch.common.xcontent.XContentParser;
|
||||
import org.elasticsearch.common.xcontent.XContentType;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
|
||||
public class NameOrDefinition implements Writeable, ToXContentFragment {
|
||||
// exactly one of these two members is not null
|
||||
public final String name;
|
||||
public final Settings definition;
|
||||
|
||||
public NameOrDefinition(String name) {
|
||||
this.name = Objects.requireNonNull(name);
|
||||
this.definition = null;
|
||||
}
|
||||
|
||||
public NameOrDefinition(Map<String, ?> definition) {
|
||||
this.name = null;
|
||||
Objects.requireNonNull(definition);
|
||||
try {
|
||||
XContentBuilder builder = XContentFactory.contentBuilder(XContentType.JSON);
|
||||
builder.map(definition);
|
||||
this.definition = Settings.builder().loadFromSource(Strings.toString(builder), builder.contentType()).build();
|
||||
} catch (IOException e) {
|
||||
throw new IllegalArgumentException("Failed to parse [" + definition + "]", e);
|
||||
}
|
||||
}
|
||||
|
||||
public NameOrDefinition(StreamInput in) throws IOException {
|
||||
name = in.readOptionalString();
|
||||
if (in.readBoolean()) {
|
||||
definition = Settings.readSettingsFromStream(in);
|
||||
} else {
|
||||
definition = null;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeTo(StreamOutput out) throws IOException {
|
||||
out.writeOptionalString(name);
|
||||
boolean isNotNullDefinition = this.definition != null;
|
||||
out.writeBoolean(isNotNullDefinition);
|
||||
if (isNotNullDefinition) {
|
||||
Settings.writeSettingsToStream(definition, out);
|
||||
}
|
||||
}
|
||||
|
||||
public static NameOrDefinition fromXContent(XContentParser parser) throws IOException {
|
||||
if (parser.currentToken() == XContentParser.Token.VALUE_STRING) {
|
||||
return new NameOrDefinition(parser.text());
|
||||
}
|
||||
if (parser.currentToken() == XContentParser.Token.START_OBJECT) {
|
||||
return new NameOrDefinition(parser.map());
|
||||
}
|
||||
throw new XContentParseException(parser.getTokenLocation(),
|
||||
"Expected [VALUE_STRING] or [START_OBJECT], got " + parser.currentToken());
|
||||
}
|
||||
|
||||
@Override
|
||||
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
|
||||
if (definition == null) {
|
||||
builder.value(name);
|
||||
} else {
|
||||
builder.startObject();
|
||||
definition.toXContent(builder, params);
|
||||
builder.endObject();
|
||||
}
|
||||
return builder;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
NameOrDefinition that = (NameOrDefinition) o;
|
||||
return Objects.equals(name, that.name) &&
|
||||
Objects.equals(definition, that.definition);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(name, definition);
|
||||
}
|
||||
}
|
|
@ -21,6 +21,8 @@ package org.elasticsearch.action.admin.indices;
|
|||
import org.apache.lucene.analysis.MockTokenFilter;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.util.automaton.Automata;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.action.admin.indices.analyze.AnalyzeAction;
|
||||
import org.elasticsearch.action.admin.indices.analyze.TransportAnalyzeAction;
|
||||
|
@ -48,6 +50,7 @@ import org.elasticsearch.test.IndexSettingsModule;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
|
@ -65,7 +68,6 @@ public class TransportAnalyzeActionTests extends ESTestCase {
|
|||
|
||||
private IndexAnalyzers indexAnalyzers;
|
||||
private AnalysisRegistry registry;
|
||||
private Environment environment;
|
||||
private int maxTokenCount;
|
||||
private int idxMaxTokenCount;
|
||||
|
||||
|
@ -80,30 +82,45 @@ public class TransportAnalyzeActionTests extends ESTestCase {
|
|||
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard")
|
||||
.put("index.analysis.analyzer.custom_analyzer.filter", "mock")
|
||||
.put("index.analysis.normalizer.my_normalizer.type", "custom")
|
||||
.put("index.analysis.char_filter.my_append.type", "append")
|
||||
.put("index.analysis.char_filter.my_append.suffix", "baz")
|
||||
.put("index.analyze.max_token_count", 100)
|
||||
.putList("index.analysis.normalizer.my_normalizer.filter", "lowercase").build();
|
||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
|
||||
environment = TestEnvironment.newEnvironment(settings);
|
||||
Environment environment = TestEnvironment.newEnvironment(settings);
|
||||
AnalysisPlugin plugin = new AnalysisPlugin() {
|
||||
class MockFactory extends AbstractTokenFilterFactory {
|
||||
|
||||
final CharacterRunAutomaton stopset;
|
||||
|
||||
MockFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
if (settings.hasValue("stopword")) {
|
||||
this.stopset = new CharacterRunAutomaton(Automata.makeString(settings.get("stopword")));
|
||||
}
|
||||
else {
|
||||
this.stopset = MockTokenFilter.ENGLISH_STOPSET;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream) {
|
||||
return new MockTokenFilter(tokenStream, MockTokenFilter.ENGLISH_STOPSET);
|
||||
return new MockTokenFilter(tokenStream, this.stopset);
|
||||
}
|
||||
}
|
||||
|
||||
class AppendCharFilterFactory extends AbstractCharFilterFactory {
|
||||
|
||||
final String suffix;
|
||||
|
||||
AppendCharFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||
super(indexSettings, name);
|
||||
this.suffix = settings.get("suffix", "bar");
|
||||
}
|
||||
|
||||
@Override
|
||||
public Reader create(Reader reader) {
|
||||
return new AppendCharFilter(reader, "bar");
|
||||
return new AppendCharFilter(reader, suffix);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -149,7 +166,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
|
|||
request.text("the quick brown fox");
|
||||
request.analyzer("standard");
|
||||
AnalyzeAction.Response analyze
|
||||
= TransportAnalyzeAction.analyze(request, registry, environment, mockIndexService(), maxTokenCount);
|
||||
= TransportAnalyzeAction.analyze(request, registry, null, maxTokenCount);
|
||||
List<AnalyzeAction.AnalyzeToken> tokens = analyze.getTokens();
|
||||
assertEquals(4, tokens.size());
|
||||
|
||||
|
@ -159,7 +176,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
|
|||
request.tokenizer("standard");
|
||||
request.addTokenFilter("mock");
|
||||
analyze
|
||||
= TransportAnalyzeAction.analyze(request, registry, environment, randomBoolean() ? mockIndexService() : null, maxTokenCount);
|
||||
= TransportAnalyzeAction.analyze(request, registry, null, maxTokenCount);
|
||||
tokens = analyze.getTokens();
|
||||
assertEquals(3, tokens.size());
|
||||
assertEquals("qu1ck", tokens.get(0).getTerm());
|
||||
|
@ -172,7 +189,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
|
|||
request.tokenizer("standard");
|
||||
request.addCharFilter("append_foo");
|
||||
analyze
|
||||
= TransportAnalyzeAction.analyze(request, registry, environment, randomBoolean() ? mockIndexService() : null, maxTokenCount);
|
||||
= TransportAnalyzeAction.analyze(request, registry, null, maxTokenCount);
|
||||
tokens = analyze.getTokens();
|
||||
assertEquals(4, tokens.size());
|
||||
assertEquals("the", tokens.get(0).getTerm());
|
||||
|
@ -187,20 +204,38 @@ public class TransportAnalyzeActionTests extends ESTestCase {
|
|||
request.addCharFilter("append");
|
||||
request.text("the qu1ck brown fox");
|
||||
analyze
|
||||
= TransportAnalyzeAction.analyze(request, registry, environment, randomBoolean() ? mockIndexService() : null, maxTokenCount);
|
||||
= TransportAnalyzeAction.analyze(request, registry, null, maxTokenCount);
|
||||
tokens = analyze.getTokens();
|
||||
assertEquals(4, tokens.size());
|
||||
assertEquals("the", tokens.get(0).getTerm());
|
||||
assertEquals("qu1ck", tokens.get(1).getTerm());
|
||||
assertEquals("brown", tokens.get(2).getTerm());
|
||||
assertEquals("foxbar", tokens.get(3).getTerm());
|
||||
|
||||
// We can pass a new configuration
|
||||
request = new AnalyzeAction.Request();
|
||||
request.text("the qu1ck brown fox");
|
||||
request.tokenizer("standard");
|
||||
Map<String, Object> tokenFilterConfig = new HashMap<>();
|
||||
tokenFilterConfig.put("type", "mock");
|
||||
tokenFilterConfig.put("stopword", "brown");
|
||||
request.addTokenFilter(tokenFilterConfig);
|
||||
request.addCharFilter("append");
|
||||
request.text("the qu1ck brown fox");
|
||||
analyze
|
||||
= TransportAnalyzeAction.analyze(request, registry, null, maxTokenCount);
|
||||
tokens = analyze.getTokens();
|
||||
assertEquals(3, tokens.size());
|
||||
assertEquals("the", tokens.get(0).getTerm());
|
||||
assertEquals("qu1ck", tokens.get(1).getTerm());
|
||||
assertEquals("foxbar", tokens.get(2).getTerm());
|
||||
}
|
||||
|
||||
public void testFillsAttributes() throws IOException {
|
||||
AnalyzeAction.Request request = new AnalyzeAction.Request();
|
||||
request.analyzer("standard");
|
||||
request.text("the 1 brown fox");
|
||||
AnalyzeAction.Response analyze = TransportAnalyzeAction.analyze(request, registry, environment, null, maxTokenCount);
|
||||
AnalyzeAction.Response analyze = TransportAnalyzeAction.analyze(request, registry, null, maxTokenCount);
|
||||
List<AnalyzeAction.AnalyzeToken> tokens = analyze.getTokens();
|
||||
assertEquals(4, tokens.size());
|
||||
assertEquals("the", tokens.get(0).getTerm());
|
||||
|
@ -233,7 +268,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
|
|||
request.text("the quick brown fox");
|
||||
request.analyzer("custom_analyzer");
|
||||
AnalyzeAction.Response analyze
|
||||
= TransportAnalyzeAction.analyze(request, registry, environment, mockIndexService(), maxTokenCount);
|
||||
= TransportAnalyzeAction.analyze(request, registry, mockIndexService(), maxTokenCount);
|
||||
List<AnalyzeAction.AnalyzeToken> tokens = analyze.getTokens();
|
||||
assertEquals(3, tokens.size());
|
||||
assertEquals("quick", tokens.get(0).getTerm());
|
||||
|
@ -241,7 +276,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
|
|||
assertEquals("fox", tokens.get(2).getTerm());
|
||||
|
||||
request.analyzer("standard");
|
||||
analyze = TransportAnalyzeAction.analyze(request, registry, environment, mockIndexService(), maxTokenCount);
|
||||
analyze = TransportAnalyzeAction.analyze(request, registry, mockIndexService(), maxTokenCount);
|
||||
tokens = analyze.getTokens();
|
||||
assertEquals(4, tokens.size());
|
||||
assertEquals("the", tokens.get(0).getTerm());
|
||||
|
@ -252,7 +287,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
|
|||
// Switch the analyzer out for just a tokenizer
|
||||
request.analyzer(null);
|
||||
request.tokenizer("standard");
|
||||
analyze = TransportAnalyzeAction.analyze(request, registry, environment, mockIndexService(), maxTokenCount);
|
||||
analyze = TransportAnalyzeAction.analyze(request, registry, mockIndexService(), maxTokenCount);
|
||||
tokens = analyze.getTokens();
|
||||
assertEquals(4, tokens.size());
|
||||
assertEquals("the", tokens.get(0).getTerm());
|
||||
|
@ -262,12 +297,33 @@ public class TransportAnalyzeActionTests extends ESTestCase {
|
|||
|
||||
// Now try applying our token filter
|
||||
request.addTokenFilter("mock");
|
||||
analyze = TransportAnalyzeAction.analyze(request, registry, environment, mockIndexService(), maxTokenCount);
|
||||
analyze = TransportAnalyzeAction.analyze(request, registry, mockIndexService(), maxTokenCount);
|
||||
tokens = analyze.getTokens();
|
||||
assertEquals(3, tokens.size());
|
||||
assertEquals("quick", tokens.get(0).getTerm());
|
||||
assertEquals("brown", tokens.get(1).getTerm());
|
||||
assertEquals("fox", tokens.get(2).getTerm());
|
||||
|
||||
// Apply the char filter, checking that the correct configuration gets passed on
|
||||
request.addCharFilter("my_append");
|
||||
analyze = TransportAnalyzeAction.analyze(request, registry, mockIndexService(), maxTokenCount);
|
||||
tokens = analyze.getTokens();
|
||||
assertEquals(3, tokens.size());
|
||||
assertEquals("quick", tokens.get(0).getTerm());
|
||||
assertEquals("brown", tokens.get(1).getTerm());
|
||||
assertEquals("foxbaz", tokens.get(2).getTerm());
|
||||
|
||||
// Apply a token filter with parameters
|
||||
Map<String, Object> tokenFilterConfig = new HashMap<>();
|
||||
tokenFilterConfig.put("type", "mock");
|
||||
tokenFilterConfig.put("stopword", "brown");
|
||||
request.addTokenFilter(tokenFilterConfig);
|
||||
analyze = TransportAnalyzeAction.analyze(request, registry, mockIndexService(), maxTokenCount);
|
||||
tokens = analyze.getTokens();
|
||||
assertEquals(2, tokens.size());
|
||||
assertEquals("quick", tokens.get(0).getTerm());
|
||||
assertEquals("foxbaz", tokens.get(1).getTerm());
|
||||
|
||||
}
|
||||
|
||||
public void testGetIndexAnalyserWithoutIndexAnalyzers() {
|
||||
|
@ -276,14 +332,14 @@ public class TransportAnalyzeActionTests extends ESTestCase {
|
|||
new AnalyzeAction.Request()
|
||||
.analyzer("custom_analyzer")
|
||||
.text("the qu1ck brown fox-dog"),
|
||||
registry, environment, null, maxTokenCount));
|
||||
registry, null, maxTokenCount));
|
||||
assertEquals(e.getMessage(), "failed to find global analyzer [custom_analyzer]");
|
||||
}
|
||||
|
||||
public void testGetFieldAnalyzerWithoutIndexAnalyzers() {
|
||||
AnalyzeAction.Request req = new AnalyzeAction.Request().field("field").text("text");
|
||||
IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> {
|
||||
TransportAnalyzeAction.analyze(req, registry, environment, null, maxTokenCount);
|
||||
TransportAnalyzeAction.analyze(req, registry, null, maxTokenCount);
|
||||
});
|
||||
assertEquals(e.getMessage(), "analysis based on a specific field requires an index");
|
||||
}
|
||||
|
@ -295,7 +351,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
|
|||
new AnalyzeAction.Request()
|
||||
.analyzer("foobar")
|
||||
.text("the qu1ck brown fox"),
|
||||
registry, environment, notGlobal ? mockIndexService() : null, maxTokenCount));
registry, notGlobal ? mockIndexService() : null, maxTokenCount));
if (notGlobal) {
assertEquals(e.getMessage(), "failed to find analyzer [foobar]");
} else {

@@ -307,7 +363,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
new AnalyzeAction.Request()
.tokenizer("foobar")
.text("the qu1ck brown fox"),
registry, environment, notGlobal ? mockIndexService() : null, maxTokenCount));
registry, notGlobal ? mockIndexService() : null, maxTokenCount));
if (notGlobal) {
assertEquals(e.getMessage(), "failed to find tokenizer under [foobar]");
} else {

@@ -320,11 +376,11 @@ public class TransportAnalyzeActionTests extends ESTestCase {
.tokenizer("standard")
.addTokenFilter("foobar")
.text("the qu1ck brown fox"),
registry, environment, notGlobal ? mockIndexService() : null, maxTokenCount));
registry, notGlobal ? mockIndexService() : null, maxTokenCount));
if (notGlobal) {
assertEquals(e.getMessage(), "failed to find token filter under [foobar]");
assertEquals(e.getMessage(), "failed to find filter under [foobar]");
} else {
assertEquals(e.getMessage(), "failed to find global token filter under [foobar]");
assertEquals(e.getMessage(), "failed to find global filter under [foobar]");
}

e = expectThrows(IllegalArgumentException.class,

@@ -334,11 +390,11 @@ public class TransportAnalyzeActionTests extends ESTestCase {
.addTokenFilter("lowercase")
.addCharFilter("foobar")
.text("the qu1ck brown fox"),
registry, environment, notGlobal ? mockIndexService() : null, maxTokenCount));
registry, notGlobal ? mockIndexService() : null, maxTokenCount));
if (notGlobal) {
assertEquals(e.getMessage(), "failed to find char filter under [foobar]");
assertEquals(e.getMessage(), "failed to find char_filter under [foobar]");
} else {
assertEquals(e.getMessage(), "failed to find global char filter under [foobar]");
assertEquals(e.getMessage(), "failed to find global char_filter under [foobar]");
}

e = expectThrows(IllegalArgumentException.class,

@@ -346,7 +402,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
new AnalyzeAction.Request()
.normalizer("foobar")
.text("the qu1ck brown fox"),
registry, environment, mockIndexService(), maxTokenCount));
registry, mockIndexService(), maxTokenCount));
assertEquals(e.getMessage(), "failed to find normalizer under [foobar]");
}

@@ -356,7 +412,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
request.addTokenFilter("stop"); // stop token filter is not prebuilt in AnalysisModule#setupPreConfiguredTokenFilters()
request.text("the quick brown fox");
AnalyzeAction.Response analyze
= TransportAnalyzeAction.analyze(request, registry, environment, mockIndexService(), maxTokenCount);
= TransportAnalyzeAction.analyze(request, registry, mockIndexService(), maxTokenCount);
List<AnalyzeAction.AnalyzeToken> tokens = analyze.getTokens();
assertEquals(3, tokens.size());
assertEquals("quick", tokens.get(0).getTerm());

@@ -364,12 +420,28 @@ public class TransportAnalyzeActionTests extends ESTestCase {
assertEquals("fox", tokens.get(2).getTerm());
}

public void testCustomCharFilterWithParameters() throws IOException {
AnalyzeAction.Request request = new AnalyzeAction.Request();
request.tokenizer("standard");
Map<String, Object> charFilterConfig = new HashMap<>();
charFilterConfig.put("type", "append");
charFilterConfig.put("suffix", "foo");
request.addCharFilter(charFilterConfig);
request.text("quick brown");
AnalyzeAction.Response analyze =
TransportAnalyzeAction.analyze(request, registry, mockIndexService(), maxTokenCount);
List<AnalyzeAction.AnalyzeToken> tokens = analyze.getTokens();
assertEquals(2, tokens.size());
assertEquals("quick", tokens.get(0).getTerm());
assertEquals("brownfoo", tokens.get(1).getTerm());
}

public void testNormalizerWithIndex() throws IOException {
AnalyzeAction.Request request = new AnalyzeAction.Request("index");
request.normalizer("my_normalizer");
request.text("ABc");
AnalyzeAction.Response analyze
= TransportAnalyzeAction.analyze(request, registry, environment, mockIndexService(), maxTokenCount);
= TransportAnalyzeAction.analyze(request, registry, mockIndexService(), maxTokenCount);
List<AnalyzeAction.AnalyzeToken> tokens = analyze.getTokens();

assertEquals(1, tokens.size());

@@ -394,7 +466,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
request.text(text);
request.analyzer("standard");
IllegalStateException e = expectThrows(IllegalStateException.class,
() -> TransportAnalyzeAction.analyze(request, registry, environment, null, maxTokenCount));
() -> TransportAnalyzeAction.analyze(request, registry, null, maxTokenCount));
assertEquals(e.getMessage(), "The number of tokens produced by calling _analyze has exceeded the allowed maximum of ["
+ maxTokenCount + "]." + " This limit can be set by changing the [index.analyze.max_token_count] index level setting.");

@@ -404,7 +476,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
request2.analyzer("standard");
request2.explain(true);
IllegalStateException e2 = expectThrows(IllegalStateException.class,
() -> TransportAnalyzeAction.analyze(request2, registry, environment, null, maxTokenCount));
() -> TransportAnalyzeAction.analyze(request2, registry, null, maxTokenCount));
assertEquals(e2.getMessage(), "The number of tokens produced by calling _analyze has exceeded the allowed maximum of ["
+ maxTokenCount + "]." + " This limit can be set by changing the [index.analyze.max_token_count] index level setting.");
}

@@ -426,7 +498,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
request.text(text);
request.analyzer("standard");
IllegalStateException e = expectThrows(IllegalStateException.class,
() -> TransportAnalyzeAction.analyze(request, registry, environment, null, idxMaxTokenCount));
() -> TransportAnalyzeAction.analyze(request, registry, null, idxMaxTokenCount));
assertEquals(e.getMessage(), "The number of tokens produced by calling _analyze has exceeded the allowed maximum of ["
+ idxMaxTokenCount + "]." + " This limit can be set by changing the [index.analyze.max_token_count] index level setting.");
}
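
Every change in TransportAnalyzeActionTests above follows the same two patterns: the Environment argument is dropped from TransportAnalyzeAction.analyze(), and inline-defined components are now resolved and anonymously named by the AnalysisRegistry itself. A minimal sketch in the style of these tests, illustrative only and not part of the commit; registry and maxTokenCount stand for the existing test fixtures, and "append" is the char filter used by testCustomCharFilterWithParameters (apparently registered by the test's own analysis setup):

    // Sketch only: index-independent _analyze with an inline char filter definition.
    AnalyzeAction.Request request = new AnalyzeAction.Request();   // no index name
    request.tokenizer("standard");
    Map<String, Object> appendFoo = new HashMap<>();
    appendFoo.put("type", "append");    // inline definitions are now reported as "__anonymous__<type>"
    appendFoo.put("suffix", "foo");
    request.addCharFilter(appendFoo);
    request.text("quick brown");
    // No Environment parameter any more; passing null instead of an IndexService limits
    // resolution to global/built-in analysis components.
    AnalyzeAction.Response response =
        TransportAnalyzeAction.analyze(request, registry, null, maxTokenCount);
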
@@ -345,7 +345,7 @@ public class AnalyzeActionIT extends ESIntegTestCase {
assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens()[2].getPositionLength(), equalTo(1));

// tokenfilter({"type": "stop", "stopwords": ["foo", "buzz"]})
assertThat(analyzeResponse.detail().tokenfilters()[1].getName(), equalTo("_anonymous_tokenfilter"));
assertThat(analyzeResponse.detail().tokenfilters()[1].getName(), equalTo("__anonymous__stop"));
assertThat(analyzeResponse.detail().tokenfilters()[1].getTokens().length, equalTo(1));

assertThat(analyzeResponse.detail().tokenfilters()[1].getTokens()[0].getTerm(), equalTo("test"));
@@ -24,6 +24,7 @@ import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.analysis.NameOrDefinition;
import org.elasticsearch.rest.RestController;
import org.elasticsearch.rest.RestRequest;
import org.elasticsearch.test.ESTestCase;

@@ -52,7 +53,7 @@ public class RestAnalyzeActionTests extends ESTestCase {
assertThat(analyzeRequest.text(), equalTo(new String[]{"THIS IS A TEST"}));
assertThat(analyzeRequest.tokenizer().name, equalTo("keyword"));
assertThat(analyzeRequest.tokenFilters().size(), equalTo(1));
for (AnalyzeAction.Request.NameOrDefinition filter : analyzeRequest.tokenFilters()) {
for (NameOrDefinition filter : analyzeRequest.tokenFilters()) {
assertThat(filter.name, equalTo("lowercase"));
}
}
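
The only functional change in RestAnalyzeActionTests is the type of the loop variable: NameOrDefinition is no longer nested in AnalyzeAction.Request but is the shared org.elasticsearch.index.analysis.NameOrDefinition. A brief sketch of how the shared class is used, illustrative only; constructor visibility is assumed to be public, since the ML config below constructs it from another package:

    // Sketch only: NameOrDefinition wraps either the name of a built-in component
    // or an inline definition, which it stores as Settings.
    NameOrDefinition byName = new NameOrDefinition("lowercase");
    // byName.name == "lowercase", byName.definition == null

    Map<String, Object> inline = new HashMap<>();
    inline.put("type", "stop");
    NameOrDefinition byDefinition = new NameOrDefinition(inline);
    // byDefinition.name == null, byDefinition.definition.get("type") equals "stop"
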
@@ -6,16 +6,13 @@
package org.elasticsearch.xpack.core.ml.job.config;

import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.io.stream.Writeable;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.ToXContentFragment;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.analysis.NameOrDefinition;
import org.elasticsearch.rest.action.admin.indices.RestAnalyzeAction;

import java.io.IOException;

@@ -176,87 +173,6 @@ public class CategorizationAnalyzerConfig implements ToXContentFragment, Writeab
return builder.build();
}

/**
* Simple store of either a name of a built-in analyzer element or a custom definition.
*/
public static class NameOrDefinition implements ToXContentFragment, Writeable {

// Exactly one of these two members is not null
public final String name;
public final Settings definition;

NameOrDefinition(String name) {
this.name = Objects.requireNonNull(name);
this.definition = null;
}

NameOrDefinition(ParseField field, Map<String, Object> definition) {
this.name = null;
Objects.requireNonNull(definition);
try {
XContentBuilder builder = XContentFactory.contentBuilder(XContentType.JSON);
builder.map(definition);
this.definition = Settings.builder().loadFromSource(Strings.toString(builder), builder.contentType()).build();
} catch (IOException e) {
throw new IllegalArgumentException("Failed to parse [" + definition + "] in [" + field.getPreferredName() + "]", e);
}
}

NameOrDefinition(StreamInput in) throws IOException {
name = in.readOptionalString();
if (in.readBoolean()) {
definition = Settings.readSettingsFromStream(in);
} else {
definition = null;
}
}

@Override
public void writeTo(StreamOutput out) throws IOException {
out.writeOptionalString(name);
boolean isNotNullDefinition = this.definition != null;
out.writeBoolean(isNotNullDefinition);
if (isNotNullDefinition) {
Settings.writeSettingsToStream(definition, out);
}
}

@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
if (definition == null) {
builder.value(name);
} else {
builder.startObject();
definition.toXContent(builder, params);
builder.endObject();
}
return builder;
}

@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
NameOrDefinition that = (NameOrDefinition) o;
return Objects.equals(name, that.name) &&
Objects.equals(definition, that.definition);
}

@Override
public int hashCode() {
return Objects.hash(name, definition);
}

@Override
public String toString() {
if (definition == null) {
return name;
} else {
return definition.toDelimitedString(';');
}
}
}

private final String analyzer;
private final List<NameOrDefinition> charFilters;
private final NameOrDefinition tokenizer;

@@ -373,7 +289,7 @@ public class CategorizationAnalyzerConfig implements ToXContentFragment, Writeab
}

public Builder addCharFilter(Map<String, Object> charFilter) {
this.charFilters.add(new NameOrDefinition(CHAR_FILTERS, charFilter));
this.charFilters.add(new NameOrDefinition(charFilter));
return this;
}

@@ -383,7 +299,7 @@ public class CategorizationAnalyzerConfig implements ToXContentFragment, Writeab
}

public Builder setTokenizer(Map<String, Object> tokenizer) {
this.tokenizer = new NameOrDefinition(TOKENIZER, tokenizer);
this.tokenizer = new NameOrDefinition(tokenizer);
return this;
}

@@ -393,7 +309,7 @@ public class CategorizationAnalyzerConfig implements ToXContentFragment, Writeab
}

public Builder addTokenFilter(Map<String, Object> tokenFilter) {
this.tokenFilters.add(new NameOrDefinition(TOKEN_FILTERS, tokenFilter));
this.tokenFilters.add(new NameOrDefinition(tokenFilter));
return this;
}
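
With the nested class gone, the Builder methods above simply wrap the shared NameOrDefinition; the ParseField that was previously passed in only to improve error messages is dropped. For orientation, a sketch (not part of the diff) of assembling a custom categorization analyzer config through the unchanged Builder API; the pattern value is hypothetical:

    // Sketch only: the Builder surface is unchanged for callers.
    Map<String, Object> ignoreDigits = new HashMap<>();
    ignoreDigits.put("type", "pattern_replace");
    ignoreDigits.put("pattern", "^[0-9].*");   // hypothetical pattern, for illustration only
    CategorizationAnalyzerConfig config = new CategorizationAnalyzerConfig.Builder()
        .addCharFilter("html_strip")
        .setTokenizer("classic")
        .addTokenFilter("lowercase")
        .addTokenFilter(ignoreDigits)
        .addTokenFilter("snowball")
        .build();
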
@@ -226,7 +226,7 @@ public class JobManager {
CategorizationAnalyzerConfig categorizationAnalyzerConfig = jobBuilder.getAnalysisConfig().getCategorizationAnalyzerConfig();
if (categorizationAnalyzerConfig != null) {
CategorizationAnalyzer.verifyConfigBuilder(new CategorizationAnalyzerConfig.Builder(categorizationAnalyzerConfig),
analysisRegistry, environment);
analysisRegistry);
}
}
@@ -9,20 +9,8 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.Version;
import org.elasticsearch.action.admin.indices.analyze.TransportAnalyzeAction;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.UUIDs;
import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AnalysisRegistry;
import org.elasticsearch.index.analysis.CharFilterFactory;
import org.elasticsearch.index.analysis.CustomAnalyzer;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.indices.analysis.AnalysisModule;
import org.elasticsearch.xpack.core.ml.job.config.CategorizationAnalyzerConfig;

import java.io.Closeable;

@@ -35,21 +23,16 @@ import java.util.List;
*
* Converts messages to lists of tokens that will be fed to the ML categorization algorithm.
*
* The code in {@link #makeAnalyzer} and the methods it calls is largely copied from {@link TransportAnalyzeAction}.
* Unfortunately there is no easy way to reuse a subset of the <code>_analyze</code> action implementation, as the
* logic required here is not quite identical to that of {@link TransportAnalyzeAction}, and the required code is
* hard to partially reuse.
* TODO: consider refactoring ES core to allow more reuse.
*/
public class CategorizationAnalyzer implements Closeable {

private final Analyzer analyzer;
private final boolean closeAnalyzer;

public CategorizationAnalyzer(AnalysisRegistry analysisRegistry, Environment environment,
public CategorizationAnalyzer(AnalysisRegistry analysisRegistry,
CategorizationAnalyzerConfig categorizationAnalyzerConfig) throws IOException {

Tuple<Analyzer, Boolean> tuple = makeAnalyzer(categorizationAnalyzerConfig, analysisRegistry, environment);
Tuple<Analyzer, Boolean> tuple = makeAnalyzer(categorizationAnalyzerConfig, analysisRegistry);
analyzer = tuple.v1();
closeAnalyzer = tuple.v2();
}

@@ -93,9 +76,9 @@ public class CategorizationAnalyzer implements Closeable {
* server-side rather than client-side, as the client will not have loaded the appropriate analysis
* modules/plugins.
*/
public static void verifyConfigBuilder(CategorizationAnalyzerConfig.Builder configBuilder, AnalysisRegistry analysisRegistry,
Environment environment) throws IOException {
Tuple<Analyzer, Boolean> tuple = makeAnalyzer(configBuilder.build(), analysisRegistry, environment);
public static void verifyConfigBuilder(CategorizationAnalyzerConfig.Builder configBuilder, AnalysisRegistry analysisRegistry)
throws IOException {
Tuple<Analyzer, Boolean> tuple = makeAnalyzer(configBuilder.build(), analysisRegistry);
if (tuple.v2()) {
tuple.v1().close();
}

@@ -108,8 +91,8 @@ public class CategorizationAnalyzer implements Closeable {
* @return The first tuple member is the {@link Analyzer}; the second indicates whether the caller is responsible
* for closing it.
*/
private static Tuple<Analyzer, Boolean> makeAnalyzer(CategorizationAnalyzerConfig config, AnalysisRegistry analysisRegistry,
Environment environment) throws IOException {
private static Tuple<Analyzer, Boolean> makeAnalyzer(CategorizationAnalyzerConfig config, AnalysisRegistry analysisRegistry)
throws IOException {
String analyzer = config.getAnalyzer();
if (analyzer != null) {
Analyzer globalAnalyzer = analysisRegistry.getAnalyzer(analyzer);

@@ -118,162 +101,9 @@ public class CategorizationAnalyzer implements Closeable {
}
return new Tuple<>(globalAnalyzer, Boolean.FALSE);
} else {
List<CharFilterFactory> charFilterFactoryList = parseCharFilterFactories(config, analysisRegistry, environment);

Tuple<String, TokenizerFactory> tokenizerFactory = parseTokenizerFactory(config, analysisRegistry, environment);

List<TokenFilterFactory> tokenFilterFactoryList = parseTokenFilterFactories(config, analysisRegistry, environment,
tokenizerFactory, charFilterFactoryList);

return new Tuple<>(new CustomAnalyzer(tokenizerFactory.v1(), tokenizerFactory.v2(),
charFilterFactoryList.toArray(new CharFilterFactory[charFilterFactoryList.size()]),
tokenFilterFactoryList.toArray(new TokenFilterFactory[tokenFilterFactoryList.size()])), Boolean.TRUE);
return new Tuple<>(analysisRegistry.buildCustomAnalyzer(null, false,
config.getTokenizer(), config.getCharFilters(), config.getTokenFilters()), Boolean.TRUE);
}
}

/**
* Get char filter factories for each configured char filter. Each configuration
* element can be the name of an out-of-the-box char filter, or a custom definition.
*/
private static List<CharFilterFactory> parseCharFilterFactories(CategorizationAnalyzerConfig config, AnalysisRegistry analysisRegistry,
Environment environment) throws IOException {
List<CategorizationAnalyzerConfig.NameOrDefinition> charFilters = config.getCharFilters();
final List<CharFilterFactory> charFilterFactoryList = new ArrayList<>();
for (CategorizationAnalyzerConfig.NameOrDefinition charFilter : charFilters) {
final CharFilterFactory charFilterFactory;
if (charFilter.name != null) {
AnalysisModule.AnalysisProvider<CharFilterFactory> charFilterFactoryFactory =
analysisRegistry.getCharFilterProvider(charFilter.name);
if (charFilterFactoryFactory == null) {
throw new IllegalArgumentException("Failed to find global char filter under [" + charFilter.name + "]");
}
charFilterFactory = charFilterFactoryFactory.get(environment, charFilter.name);
} else {
String charFilterTypeName = charFilter.definition.get("type");
if (charFilterTypeName == null) {
throw new IllegalArgumentException("Missing [type] setting for char filter: " + charFilter.definition);
}
AnalysisModule.AnalysisProvider<CharFilterFactory> charFilterFactoryFactory =
analysisRegistry.getCharFilterProvider(charFilterTypeName);
if (charFilterFactoryFactory == null) {
throw new IllegalArgumentException("Failed to find global char filter under [" + charFilterTypeName + "]");
}
Settings settings = augmentSettings(charFilter.definition);
// Need to set anonymous "name" of char_filter
charFilterFactory = charFilterFactoryFactory.get(buildDummyIndexSettings(settings), environment, "_anonymous_charfilter",
settings);
}
if (charFilterFactory == null) {
throw new IllegalArgumentException("Failed to find char filter [" + charFilter + "]");
}
charFilterFactoryList.add(charFilterFactory);
}
return charFilterFactoryList;
}

/**
* Get the tokenizer factory for the configured tokenizer. The configuration
* can be the name of an out-of-the-box tokenizer, or a custom definition.
*/
private static Tuple<String, TokenizerFactory> parseTokenizerFactory(CategorizationAnalyzerConfig config,
AnalysisRegistry analysisRegistry, Environment environment)
throws IOException {
CategorizationAnalyzerConfig.NameOrDefinition tokenizer = config.getTokenizer();
final String name;
final TokenizerFactory tokenizerFactory;
if (tokenizer.name != null) {
name = tokenizer.name;
AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory = analysisRegistry.getTokenizerProvider(name);
if (tokenizerFactoryFactory == null) {
throw new IllegalArgumentException("Failed to find global tokenizer under [" + name + "]");
}
tokenizerFactory = tokenizerFactoryFactory.get(environment, name);
} else {
String tokenizerTypeName = tokenizer.definition.get("type");
if (tokenizerTypeName == null) {
throw new IllegalArgumentException("Missing [type] setting for tokenizer: " + tokenizer.definition);
}
AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory =
analysisRegistry.getTokenizerProvider(tokenizerTypeName);
if (tokenizerFactoryFactory == null) {
throw new IllegalArgumentException("Failed to find global tokenizer under [" + tokenizerTypeName + "]");
}
Settings settings = augmentSettings(tokenizer.definition);
// Need to set anonymous "name" of tokenizer
name = "_anonymous_tokenizer";
tokenizerFactory = tokenizerFactoryFactory.get(buildDummyIndexSettings(settings), environment, name, settings);
}
return new Tuple<>(name, tokenizerFactory);
}

/**
* Get token filter factories for each configured token filter. Each configuration
* element can be the name of an out-of-the-box token filter, or a custom definition.
*/
private static List<TokenFilterFactory> parseTokenFilterFactories(CategorizationAnalyzerConfig config,
AnalysisRegistry analysisRegistry, Environment environment,
Tuple<String, TokenizerFactory> tokenizerFactory,
List<CharFilterFactory> charFilterFactoryList) throws IOException {
List<CategorizationAnalyzerConfig.NameOrDefinition> tokenFilters = config.getTokenFilters();
TransportAnalyzeAction.DeferredTokenFilterRegistry deferredRegistry
= new TransportAnalyzeAction.DeferredTokenFilterRegistry(analysisRegistry, null);
final List<TokenFilterFactory> tokenFilterFactoryList = new ArrayList<>();
for (CategorizationAnalyzerConfig.NameOrDefinition tokenFilter : tokenFilters) {
TokenFilterFactory tokenFilterFactory;
if (tokenFilter.name != null) {
AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory;
tokenFilterFactoryFactory = analysisRegistry.getTokenFilterProvider(tokenFilter.name);
if (tokenFilterFactoryFactory == null) {
throw new IllegalArgumentException("Failed to find global token filter under [" + tokenFilter.name + "]");
}
tokenFilterFactory = tokenFilterFactoryFactory.get(environment, tokenFilter.name);
} else {
String filterTypeName = tokenFilter.definition.get("type");
if (filterTypeName == null) {
throw new IllegalArgumentException("Missing [type] setting for token filter: " + tokenFilter.definition);
}
AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory =
analysisRegistry.getTokenFilterProvider(filterTypeName);
if (tokenFilterFactoryFactory == null) {
throw new IllegalArgumentException("Failed to find global token filter under [" + filterTypeName + "]");
}
Settings settings = augmentSettings(tokenFilter.definition);
// Need to set anonymous "name" of token_filter
tokenFilterFactory = tokenFilterFactoryFactory.get(buildDummyIndexSettings(settings), environment, "_anonymous_tokenfilter",
settings);
tokenFilterFactory = tokenFilterFactory.getChainAwareTokenFilterFactory(tokenizerFactory.v2(),
charFilterFactoryList, tokenFilterFactoryList, deferredRegistry);
}
if (tokenFilterFactory == null) {
throw new IllegalArgumentException("Failed to find or create token filter [" + tokenFilter + "]");
}
tokenFilterFactoryList.add(tokenFilterFactory);
}
return tokenFilterFactoryList;
}

/**
* The Elasticsearch analysis functionality is designed to work with indices. For
* categorization we have to pretend we've got some index settings.
*/
private static IndexSettings buildDummyIndexSettings(Settings settings) {
IndexMetaData metaData = IndexMetaData.builder(IndexMetaData.INDEX_UUID_NA_VALUE).settings(settings).build();
return new IndexSettings(metaData, Settings.EMPTY);
}

/**
* The behaviour of Elasticsearch analyzers can vary between versions.
* For categorization we'll always use the latest version of the text analysis.
* The other settings are just to stop classes that expect to be associated with
* an index from complaining.
*/
private static Settings augmentSettings(Settings settings) {
return Settings.builder().put(settings)
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
.put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
.put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID())
.build();
}
}
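
The hunk above is the heart of the commit: the hand-rolled parse*Factories, buildDummyIndexSettings and augmentSettings machinery is deleted and replaced by a single call into the registry. A sketch of the custom branch of makeAnalyzer as it now reads; argument meanings are inferred from the call site (a nullable index-settings argument for index-independent analysis, a flag distinguishing normalizers from analyzers, then the tokenizer, char filter and token filter definitions):

    // Sketch only: the AnalysisRegistry now owns anonymous-component naming, dummy index
    // settings and version handling that previously lived in this class.
    Analyzer custom = analysisRegistry.buildCustomAnalyzer(
        null,                        // no index: resolve against global/built-in providers
        false,                       // build an analyzer rather than a normalizer (inferred)
        config.getTokenizer(),       // NameOrDefinition
        config.getCharFilters(),     // List<NameOrDefinition>
        config.getTokenFilters());   // List<NameOrDefinition>
    // The second tuple member stays Boolean.TRUE: the caller owns the analyzer and must close it.
    return new Tuple<>(custom, Boolean.TRUE);
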
@@ -384,6 +384,6 @@ public class AutodetectCommunicator implements Closeable {
categorizationAnalyzerConfig =
CategorizationAnalyzerConfig.buildDefaultCategorizationAnalyzer(analysisConfig.getCategorizationFilters());
}
categorizationAnalyzer = new CategorizationAnalyzer(analysisRegistry, environment, categorizationAnalyzerConfig);
categorizationAnalyzer = new CategorizationAnalyzer(analysisRegistry, categorizationAnalyzerConfig);
}
}
@@ -25,7 +25,6 @@ import java.util.Map;
public class CategorizationAnalyzerTests extends ESTestCase {

private AnalysisRegistry analysisRegistry;
private Environment environment;

public static AnalysisRegistry buildTestAnalysisRegistry(Environment environment) throws Exception {
CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin();

@@ -36,32 +35,32 @@ public class CategorizationAnalyzerTests extends ESTestCase {
@Before
public void setup() throws Exception {
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir()).build();
environment = TestEnvironment.newEnvironment(settings);
Environment environment = TestEnvironment.newEnvironment(settings);
analysisRegistry = buildTestAnalysisRegistry(environment);
}

public void testVerifyConfigBuilder_GivenNoConfig() {
CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder();
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
() -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment));
() -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry));
assertEquals("categorization_analyzer that is not a global analyzer must specify a [tokenizer] field", e.getMessage());
}

public void testVerifyConfigBuilder_GivenDefault() throws IOException {
CategorizationAnalyzerConfig defaultConfig = CategorizationAnalyzerConfig.buildDefaultCategorizationAnalyzer(null);
CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder(defaultConfig);
CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment);
CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry);
}

public void testVerifyConfigBuilder_GivenValidAnalyzer() throws IOException {
CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder().setAnalyzer("standard");
CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment);
CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry);
}

public void testVerifyConfigBuilder_GivenInvalidAnalyzer() {
CategorizationAnalyzerConfig.Builder builder = new CategorizationAnalyzerConfig.Builder().setAnalyzer("does not exist");
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
() -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment));
() -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry));
assertEquals("Failed to find global analyzer [does not exist]", e.getMessage());
}

@@ -78,7 +77,7 @@ public class CategorizationAnalyzerTests extends ESTestCase {
.addTokenFilter("lowercase")
.addTokenFilter(ignoreStuffThatBeginsWithADigit)
.addTokenFilter("snowball");
CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment);
CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry);
}

public void testVerifyConfigBuilder_GivenCustomConfigWithInvalidCharFilter() {

@@ -88,8 +87,8 @@ public class CategorizationAnalyzerTests extends ESTestCase {
.addTokenFilter("lowercase")
.addTokenFilter("snowball");
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
() -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment));
assertEquals("Failed to find global char filter under [wrong!]", e.getMessage());
() -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry));
assertEquals("failed to find global char_filter under [wrong!]", e.getMessage());
}

public void testVerifyConfigBuilder_GivenCustomConfigWithMisconfiguredCharFilter() {

@@ -102,8 +101,8 @@ public class CategorizationAnalyzerTests extends ESTestCase {
.addTokenFilter("lowercase")
.addTokenFilter("snowball");
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
() -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment));
assertEquals("pattern is missing for [_anonymous_charfilter] char filter of type 'pattern_replace'", e.getMessage());
() -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry));
assertEquals("pattern is missing for [__anonymous__pattern_replace] char filter of type 'pattern_replace'", e.getMessage());
}

public void testVerifyConfigBuilder_GivenCustomConfigWithInvalidTokenizer() {

@@ -116,8 +115,8 @@ public class CategorizationAnalyzerTests extends ESTestCase {
.addTokenFilter("lowercase")
.addTokenFilter("snowball");
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
() -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment));
assertEquals("Failed to find global tokenizer under [oops!]", e.getMessage());
() -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry));
assertEquals("failed to find global tokenizer under [oops!]", e.getMessage());
}

public void testVerifyConfigBuilder_GivenNoTokenizer() {

@@ -133,7 +132,7 @@ public class CategorizationAnalyzerTests extends ESTestCase {
.addTokenFilter(ignoreStuffThatBeginsWithADigit)
.addTokenFilter("snowball");
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
() -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment));
() -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry));
assertEquals("categorization_analyzer that is not a global analyzer must specify a [tokenizer] field", e.getMessage());
}

@@ -147,8 +146,8 @@ public class CategorizationAnalyzerTests extends ESTestCase {
.addTokenFilter("lowercase")
.addTokenFilter("oh dear!");
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
() -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment));
assertEquals("Failed to find global token filter under [oh dear!]", e.getMessage());
() -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry));
assertEquals("failed to find global filter under [oh dear!]", e.getMessage());
}

public void testVerifyConfigBuilder_GivenCustomConfigWithMisconfiguredTokenFilter() {

@@ -161,8 +160,8 @@ public class CategorizationAnalyzerTests extends ESTestCase {
.addTokenFilter("lowercase")
.addTokenFilter(noPattern);
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
() -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment));
assertEquals("pattern is missing for [_anonymous_tokenfilter] token filter of type 'pattern_replace'", e.getMessage());
() -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry));
assertEquals("pattern is missing for [__anonymous__pattern_replace] token filter of type 'pattern_replace'", e.getMessage());
}

public void testVerifyConfigBuilder_GivenAnalyzerAndCharFilter() {

@@ -170,7 +169,7 @@ public class CategorizationAnalyzerTests extends ESTestCase {
.setAnalyzer("standard")
.addCharFilter("html_strip");
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
() -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment));
() -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry));
assertEquals("categorization_analyzer that is a global analyzer cannot also specify a [char_filter] field", e.getMessage());
}

@@ -179,7 +178,7 @@ public class CategorizationAnalyzerTests extends ESTestCase {
.setAnalyzer("standard")
.setTokenizer("classic");
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
() -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment));
() -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry));
assertEquals("categorization_analyzer that is a global analyzer cannot also specify a [tokenizer] field", e.getMessage());
}

@@ -188,14 +187,14 @@ public class CategorizationAnalyzerTests extends ESTestCase {
.setAnalyzer("standard")
.addTokenFilter("lowercase");
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
() -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry, environment));
() -> CategorizationAnalyzer.verifyConfigBuilder(builder, analysisRegistry));
assertEquals("categorization_analyzer that is a global analyzer cannot also specify a [filter] field", e.getMessage());
}

// The default categorization analyzer matches what the analyzer in the ML C++ does
public void testDefaultCategorizationAnalyzer() throws IOException {
CategorizationAnalyzerConfig defaultConfig = CategorizationAnalyzerConfig.buildDefaultCategorizationAnalyzer(null);
try (CategorizationAnalyzer categorizationAnalyzer = new CategorizationAnalyzer(analysisRegistry, environment, defaultConfig)) {
try (CategorizationAnalyzer categorizationAnalyzer = new CategorizationAnalyzer(analysisRegistry, defaultConfig)) {

assertEquals(Arrays.asList("ml13-4608.1.p2ps", "Info", "Source", "ML_SERVICE2", "on", "has", "shut", "down"),
categorizationAnalyzer.tokenizeField("p2ps",

@@ -225,7 +224,7 @@ public class CategorizationAnalyzerTests extends ESTestCase {
// A categorization filter that removes stuff in square brackets
CategorizationAnalyzerConfig defaultConfigWithCategorizationFilter =
CategorizationAnalyzerConfig.buildDefaultCategorizationAnalyzer(Collections.singletonList("\\[[^\\]]*\\]"));
try (CategorizationAnalyzer categorizationAnalyzer = new CategorizationAnalyzer(analysisRegistry, environment,
try (CategorizationAnalyzer categorizationAnalyzer = new CategorizationAnalyzer(analysisRegistry,
defaultConfigWithCategorizationFilter)) {

assertEquals(Arrays.asList("ml13-4608.1.p2ps", "Info", "Source", "ML_SERVICE2", "on", "has", "shut", "down"),

@@ -255,7 +254,7 @@ public class CategorizationAnalyzerTests extends ESTestCase {
// NOT for ML categorization (and you'll see why if you look at the expected results of this test!)
public void testStandardAnalyzer() throws IOException {
CategorizationAnalyzerConfig config = new CategorizationAnalyzerConfig.Builder().setAnalyzer("standard").build();
try (CategorizationAnalyzer categorizationAnalyzer = new CategorizationAnalyzer(analysisRegistry, environment, config)) {
try (CategorizationAnalyzer categorizationAnalyzer = new CategorizationAnalyzer(analysisRegistry, config)) {

assertEquals(Arrays.asList("ml13", "4608.1", "p2ps", "info", "source", "ml_service2", "on", "13122", "867", "has", "shut",
"down"),

@@ -298,7 +297,7 @@ public class CategorizationAnalyzerTests extends ESTestCase {
.addTokenFilter(ignoreStuffThatBeginsWithADigit)
.addTokenFilter("snowball")
.build();
try (CategorizationAnalyzer categorizationAnalyzer = new CategorizationAnalyzer(analysisRegistry, environment, config)) {
try (CategorizationAnalyzer categorizationAnalyzer = new CategorizationAnalyzer(analysisRegistry, config)) {

assertEquals(Arrays.asList("ml13-4608.1.p2ps", "info", "sourc", "ml_service2", "on", "has", "shut", "down"),
categorizationAnalyzer.tokenizeField("p2ps",

@@ -325,14 +324,14 @@ public class CategorizationAnalyzerTests extends ESTestCase {

public void testEmptyString() throws IOException {
CategorizationAnalyzerConfig defaultConfig = CategorizationAnalyzerConfig.buildDefaultCategorizationAnalyzer(null);
try (CategorizationAnalyzer categorizationAnalyzer = new CategorizationAnalyzer(analysisRegistry, environment, defaultConfig)) {
try (CategorizationAnalyzer categorizationAnalyzer = new CategorizationAnalyzer(analysisRegistry, defaultConfig)) {
assertEquals(Collections.emptyList(), categorizationAnalyzer.tokenizeField("foo", ""));
}
}

public void testThaiAnalyzer() throws IOException {
CategorizationAnalyzerConfig config = new CategorizationAnalyzerConfig.Builder().setAnalyzer("thai").build();
try (CategorizationAnalyzer categorizationAnalyzer = new CategorizationAnalyzer(analysisRegistry, environment, config)) {
try (CategorizationAnalyzer categorizationAnalyzer = new CategorizationAnalyzer(analysisRegistry, config)) {

// An example from the ES docs - no idea what it means or whether it's remotely sensible from a categorization point-of-view
assertEquals(Arrays.asList("แสดง", "งาน", "ดี"),

@@ -343,6 +342,6 @@ public class CategorizationAnalyzerTests extends ESTestCase {

public void testInvalidAnalyzer() {
CategorizationAnalyzerConfig config = new CategorizationAnalyzerConfig.Builder().setAnalyzer("does not exist").build();
expectThrows(IllegalArgumentException.class, () -> new CategorizationAnalyzer(analysisRegistry, environment, config));
expectThrows(IllegalArgumentException.class, () -> new CategorizationAnalyzer(analysisRegistry, config));
}
}
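
Every test in CategorizationAnalyzerTests makes the same mechanical change: the Environment argument disappears, which is also why the environment field could be reduced to a local in setup(). For reference, a short sketch (not part of the diff) of the resulting usage; the field name and message text are placeholders:

    // Sketch only: verification and construction both need just the AnalysisRegistry now.
    CategorizationAnalyzerConfig defaultConfig =
        CategorizationAnalyzerConfig.buildDefaultCategorizationAnalyzer(null);
    CategorizationAnalyzer.verifyConfigBuilder(
        new CategorizationAnalyzerConfig.Builder(defaultConfig), analysisRegistry);
    try (CategorizationAnalyzer analyzer = new CategorizationAnalyzer(analysisRegistry, defaultConfig)) {
        List<String> tokens = analyzer.tokenizeField("message", "example log line");   // placeholder input
    }
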
@@ -109,7 +109,7 @@ public class AbstractDataToProcessWriterTests extends ESTestCase {

public void testTokenizeForCategorization() throws IOException {
CategorizationAnalyzerConfig defaultConfig = CategorizationAnalyzerConfig.buildDefaultCategorizationAnalyzer(null);
try (CategorizationAnalyzer categorizationAnalyzer = new CategorizationAnalyzer(analysisRegistry, environment, defaultConfig)) {
try (CategorizationAnalyzer categorizationAnalyzer = new CategorizationAnalyzer(analysisRegistry, defaultConfig)) {

assertEquals("sol13m-8608.1.p2ps,Info,Source,AES_SERVICE2,on,has,shut,down",
AbstractDataToProcessWriter.tokenizeForCategorization(categorizationAnalyzer, "p2ps",
@@ -125,7 +125,7 @@ public class CsvDataToProcessWriterTests extends ESTestCase {
CsvDataToProcessWriter writer = createWriter();
writer.writeHeader();
try (CategorizationAnalyzer categorizationAnalyzer =
new CategorizationAnalyzer(analysisRegistry, environment, analysisConfig.getCategorizationAnalyzerConfig())) {
new CategorizationAnalyzer(analysisRegistry, analysisConfig.getCategorizationAnalyzerConfig())) {
writer.write(inputStream, categorizationAnalyzer, null, (r, e) -> {});
}
verify(dataCountsReporter, times(1)).startNewIncrementalCount();
@@ -124,7 +124,7 @@ public class JsonDataToProcessWriterTests extends ESTestCase {
JsonDataToProcessWriter writer = createWriter();
writer.writeHeader();
try (CategorizationAnalyzer categorizationAnalyzer =
new CategorizationAnalyzer(analysisRegistry, environment, analysisConfig.getCategorizationAnalyzerConfig())) {
new CategorizationAnalyzer(analysisRegistry, analysisConfig.getCategorizationAnalyzerConfig())) {
writer.write(inputStream, categorizationAnalyzer, XContentType.JSON, (r, e) -> {});
}
verify(dataCountsReporter, times(1)).startNewIncrementalCount();