Add the ability to set an analyzer on keyword fields. (#21919)

This adds a new `normalizer` property to `keyword` fields that pre-processes the
field value prior to indexing, but without altering the `_source`. Note that
only the normalization components that work on a per-character basis are
applied; for instance, stemming filters will be ignored while lowercasing or
ASCII folding will be applied.

Closes #18064
Adrien Grand 2016-12-30 09:36:10 +01:00 committed by GitHub
parent 117b63ed41
commit 3f805d68cb
20 changed files with 827 additions and 68 deletions
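
To make the per-character constraint concrete before the diff, here is a minimal,
hypothetical Lucene sketch of what such a normalizer boils down to: a keyword
tokenizer (the whole value stays one token) followed by per-character filters only.
This is not code from the commit; the class name is invented, and it assumes the
Lucene 6.x Analyzer#normalize API that the changes below build on.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.util.BytesRef;

public class NormalizerSketch {
    public static void main(String[] args) {
        Analyzer normalizer = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                // keyword tokenization: the whole field value remains a single token
                Tokenizer source = new KeywordTokenizer();
                TokenStream sink = new ASCIIFoldingFilter(new LowerCaseFilter(source));
                return new TokenStreamComponents(source, sink);
            }
            @Override
            protected TokenStream normalize(String fieldName, TokenStream in) {
                // the per-character subset of the chain, reused for query input
                return new ASCIIFoldingFilter(new LowerCaseFilter(in));
            }
        };
        BytesRef term = normalizer.normalize("field", "Cet été-là");
        System.out.println(term.utf8ToString()); // prints: cet ete-la
    }
}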

View File

@ -145,11 +145,10 @@ public class MetaDataIndexUpgradeService extends AbstractComponent {
@Override
public Set<Entry<String, NamedAnalyzer>> entrySet() {
// just to ensure we can iterate over this single analyzer
return Collections.singletonMap(fakeDefault.name(), fakeDefault).entrySet();
return Collections.emptySet();
}
};
try (IndexAnalyzers fakeIndexAnalzyers = new IndexAnalyzers(indexSettings, fakeDefault, fakeDefault, fakeDefault, analyzerMap)) {
try (IndexAnalyzers fakeIndexAnalzyers = new IndexAnalyzers(indexSettings, fakeDefault, fakeDefault, fakeDefault, analyzerMap, analyzerMap)) {
MapperService mapperService = new MapperService(indexSettings, fakeIndexAnalzyers, xContentRegistry, similarityService,
mapperRegistry, () -> null);
mapperService.merge(indexMetaData, MapperService.MergeReason.MAPPING_RECOVERY, false);

View File

@ -67,17 +67,20 @@ public final class AnalysisRegistry implements Closeable {
private final Map<String, AnalysisProvider<TokenFilterFactory>> tokenFilters;
private final Map<String, AnalysisProvider<TokenizerFactory>> tokenizers;
private final Map<String, AnalysisProvider<AnalyzerProvider<?>>> analyzers;
private final Map<String, AnalysisProvider<AnalyzerProvider<?>>> normalizers;
public AnalysisRegistry(Environment environment,
Map<String, AnalysisProvider<CharFilterFactory>> charFilters,
Map<String, AnalysisProvider<TokenFilterFactory>> tokenFilters,
Map<String, AnalysisProvider<TokenizerFactory>> tokenizers,
Map<String, AnalysisProvider<AnalyzerProvider<?>>> analyzers) {
Map<String, AnalysisProvider<AnalyzerProvider<?>>> analyzers,
Map<String, AnalysisProvider<AnalyzerProvider<?>>> normalizers) {
this.environment = environment;
this.charFilters = unmodifiableMap(charFilters);
this.tokenFilters = unmodifiableMap(tokenFilters);
this.tokenizers = unmodifiableMap(tokenizers);
this.analyzers = unmodifiableMap(analyzers);
this.normalizers = unmodifiableMap(normalizers);
}
/**
@ -151,7 +154,8 @@ public final class AnalysisRegistry implements Closeable {
final Map<String, TokenizerFactory> tokenizerFactories = buildTokenizerFactories(indexSettings);
final Map<String, TokenFilterFactory> tokenFilterFactories = buildTokenFilterFactories(indexSettings);
final Map<String, AnalyzerProvider<?>> analyzierFactories = buildAnalyzerFactories(indexSettings);
return build(indexSettings, analyzierFactories, tokenizerFactories, charFilterFactories, tokenFilterFactories);
final Map<String, AnalyzerProvider<?>> normalizerFactories = buildNormalizerFactories(indexSettings);
return build(indexSettings, analyzierFactories, normalizerFactories, tokenizerFactories, charFilterFactories, tokenFilterFactories);
}
public Map<String, TokenFilterFactory> buildTokenFilterFactories(IndexSettings indexSettings) throws IOException {
@ -164,22 +168,28 @@ public final class AnalysisRegistry implements Closeable {
*/
tokenFilters.put("synonym", requriesAnalysisSettings((is, env, name, settings) -> new SynonymTokenFilterFactory(is, env, this, name, settings)));
tokenFilters.put("synonym_graph", requriesAnalysisSettings((is, env, name, settings) -> new SynonymGraphFilterFactory(is, env, this, name, settings)));
return buildMapping(false, "tokenfilter", indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.tokenFilterFactories);
return buildMapping(Component.FILTER, indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.tokenFilterFactories);
}
public Map<String, TokenizerFactory> buildTokenizerFactories(IndexSettings indexSettings) throws IOException {
final Map<String, Settings> tokenizersSettings = indexSettings.getSettings().getGroups(INDEX_ANALYSIS_TOKENIZER);
return buildMapping(false, "tokenizer", indexSettings, tokenizersSettings, tokenizers, prebuiltAnalysis.tokenizerFactories);
return buildMapping(Component.TOKENIZER, indexSettings, tokenizersSettings, tokenizers, prebuiltAnalysis.tokenizerFactories);
}
public Map<String, CharFilterFactory> buildCharFilterFactories(IndexSettings indexSettings) throws IOException {
final Map<String, Settings> charFiltersSettings = indexSettings.getSettings().getGroups(INDEX_ANALYSIS_CHAR_FILTER);
return buildMapping(false, "charfilter", indexSettings, charFiltersSettings, charFilters, prebuiltAnalysis.charFilterFactories);
return buildMapping(Component.CHAR_FILTER, indexSettings, charFiltersSettings, charFilters, prebuiltAnalysis.charFilterFactories);
}
public Map<String, AnalyzerProvider<?>> buildAnalyzerFactories(IndexSettings indexSettings) throws IOException {
final Map<String, Settings> analyzersSettings = indexSettings.getSettings().getGroups("index.analysis.analyzer");
return buildMapping(true, "analyzer", indexSettings, analyzersSettings, analyzers, prebuiltAnalysis.analyzerProviderFactories);
return buildMapping(Component.ANALYZER, indexSettings, analyzersSettings, analyzers, prebuiltAnalysis.analyzerProviderFactories);
}
public Map<String, AnalyzerProvider<?>> buildNormalizerFactories(IndexSettings indexSettings) throws IOException {
final Map<String, Settings> noralizersSettings = indexSettings.getSettings().getGroups("index.analysis.normalizer");
// TODO: Have pre-built normalizers
return buildMapping(Component.NORMALIZER, indexSettings, noralizersSettings, normalizers, Collections.emptyMap());
}
/**
@ -194,7 +204,7 @@ public final class AnalysisRegistry implements Closeable {
final Map<String, Settings> tokenizerSettings = indexSettings.getSettings().getGroups("index.analysis.tokenizer");
if (tokenizerSettings.containsKey(tokenizer)) {
Settings currentSettings = tokenizerSettings.get(tokenizer);
return getAnalysisProvider("tokenizer", tokenizers, tokenizer, currentSettings.get("type"));
return getAnalysisProvider(Component.TOKENIZER, tokenizers, tokenizer, currentSettings.get("type"));
} else {
return getTokenizerProvider(tokenizer);
}
@ -223,7 +233,7 @@ public final class AnalysisRegistry implements Closeable {
} else if ("synonym_graph".equals(typeName)) {
return requriesAnalysisSettings((is, env, name, settings) -> new SynonymGraphFilterFactory(is, env, this, name, settings));
} else {
return getAnalysisProvider("tokenfilter", tokenFilters, tokenFilter, typeName);
return getAnalysisProvider(Component.FILTER, tokenFilters, tokenFilter, typeName);
}
} else {
return getTokenFilterProvider(tokenFilter);
@ -242,7 +252,7 @@ public final class AnalysisRegistry implements Closeable {
final Map<String, Settings> tokenFilterSettings = indexSettings.getSettings().getGroups("index.analysis.char_filter");
if (tokenFilterSettings.containsKey(charFilter)) {
Settings currentSettings = tokenFilterSettings.get(charFilter);
return getAnalysisProvider("charfilter", charFilters, charFilter, currentSettings.get("type"));
return getAnalysisProvider(Component.CHAR_FILTER, charFilters, charFilter, currentSettings.get("type"));
} else {
return getCharFilterProvider(charFilter);
}
@ -261,7 +271,40 @@ public final class AnalysisRegistry implements Closeable {
};
}
private <T> Map<String, T> buildMapping(boolean analyzer, String toBuild, IndexSettings settings, Map<String, Settings> settingsMap,
enum Component {
ANALYZER {
@Override
public String toString() {
return "analyzer";
}
},
NORMALIZER {
@Override
public String toString() {
return "normalizer";
}
},
CHAR_FILTER {
@Override
public String toString() {
return "char_filter";
}
},
TOKENIZER {
@Override
public String toString() {
return "tokenizer";
}
},
FILTER {
@Override
public String toString() {
return "filter";
}
};
}
private <T> Map<String, T> buildMapping(Component component, IndexSettings settings, Map<String, Settings> settingsMap,
Map<String, AnalysisModule.AnalysisProvider<T>> providerMap, Map<String, AnalysisModule.AnalysisProvider<T>> defaultInstance)
throws IOException {
Settings defaultSettings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, settings.getIndexVersionCreated()).build();
@ -270,29 +313,34 @@ public final class AnalysisRegistry implements Closeable {
String name = entry.getKey();
Settings currentSettings = entry.getValue();
String typeName = currentSettings.get("type");
if (analyzer) {
T factory;
if (component == Component.ANALYZER) {
T factory = null;
if (typeName == null) {
if (currentSettings.get("tokenizer") != null) {
factory = (T) new CustomAnalyzerProvider(settings, name, currentSettings);
} else {
throw new IllegalArgumentException(toBuild + " [" + name + "] must specify either an analyzer type, or a tokenizer");
throw new IllegalArgumentException(component + " [" + name + "] must specify either an analyzer type, or a tokenizer");
}
} else if (typeName.equals("custom")) {
factory = (T) new CustomAnalyzerProvider(settings, name, currentSettings);
} else {
AnalysisModule.AnalysisProvider<T> type = providerMap.get(typeName);
if (type == null) {
throw new IllegalArgumentException("Unknown " + toBuild + " type [" + typeName + "] for [" + name + "]");
}
factory = type.get(settings, environment, name, currentSettings);
}
factories.put(name, factory);
} else {
AnalysisProvider<T> type = getAnalysisProvider(toBuild, providerMap, name, typeName);
final T factory = type.get(settings, environment, name, currentSettings);
factories.put(name, factory);
if (factory != null) {
factories.put(name, factory);
continue;
}
} else if (component == Component.NORMALIZER) {
if (typeName == null || typeName.equals("custom")) {
T factory = (T) new CustomNormalizerProvider(settings, name, currentSettings);
factories.put(name, factory);
continue;
}
}
AnalysisProvider<T> type = getAnalysisProvider(component, providerMap, name, typeName);
if (type == null) {
throw new IllegalArgumentException("Unknown " + component + " type [" + typeName + "] for [" + name + "]");
}
final T factory = type.get(settings, environment, name, currentSettings);
factories.put(name, factory);
}
// go over the char filters in the bindings and register the ones that are not configured
@ -330,13 +378,13 @@ public final class AnalysisRegistry implements Closeable {
return factories;
}
private <T> AnalysisProvider<T> getAnalysisProvider(String toBuild, Map<String, AnalysisProvider<T>> providerMap, String name, String typeName) {
private <T> AnalysisProvider<T> getAnalysisProvider(Component component, Map<String, AnalysisProvider<T>> providerMap, String name, String typeName) {
if (typeName == null) {
throw new IllegalArgumentException(toBuild + " [" + name + "] must specify either an analyzer type, or a tokenizer");
throw new IllegalArgumentException(component + " [" + name + "] must specify either an analyzer type, or a tokenizer");
}
AnalysisProvider<T> type = providerMap.get(typeName);
if (type == null) {
throw new IllegalArgumentException("Unknown " + toBuild + " type [" + typeName + "] for [" + name + "]");
throw new IllegalArgumentException("Unknown " + component + " type [" + typeName + "] for [" + name + "]");
}
return type;
}
@ -426,6 +474,7 @@ public final class AnalysisRegistry implements Closeable {
public IndexAnalyzers build(IndexSettings indexSettings,
Map<String, AnalyzerProvider<?>> analyzerProviders,
Map<String, AnalyzerProvider<?>> normalizerProviders,
Map<String, TokenizerFactory> tokenizerFactoryFactories,
Map<String, CharFilterFactory> charFilterFactoryFactories,
Map<String, TokenFilterFactory> tokenFilterFactoryFactories) {
@ -436,10 +485,15 @@ public final class AnalysisRegistry implements Closeable {
DeprecationLogger deprecationLogger = new DeprecationLogger(logger);
Map<String, NamedAnalyzer> analyzerAliases = new HashMap<>();
Map<String, NamedAnalyzer> analyzers = new HashMap<>();
Map<String, NamedAnalyzer> normalizers = new HashMap<>();
for (Map.Entry<String, AnalyzerProvider<?>> entry : analyzerProviders.entrySet()) {
processAnalyzerFactory(deprecationLogger, indexSettings, entry.getKey(), entry.getValue(), analyzerAliases, analyzers,
tokenFilterFactoryFactories, charFilterFactoryFactories, tokenizerFactoryFactories);
}
for (Map.Entry<String, AnalyzerProvider<?>> entry : normalizerProviders.entrySet()) {
processNormalizerFactory(deprecationLogger, indexSettings, entry.getKey(), entry.getValue(), normalizers,
tokenFilterFactoryFactories, charFilterFactoryFactories);
}
for (Map.Entry<String, NamedAnalyzer> entry : analyzerAliases.entrySet()) {
String key = entry.getKey();
if (analyzers.containsKey(key) &&
@ -485,7 +539,7 @@ public final class AnalysisRegistry implements Closeable {
}
}
return new IndexAnalyzers(indexSettings, defaultIndexAnalyzer, defaultSearchAnalyzer, defaultSearchQuoteAnalyzer,
unmodifiableMap(analyzers));
unmodifiableMap(analyzers), unmodifiableMap(normalizers));
}
private void processAnalyzerFactory(DeprecationLogger deprecationLogger,
@ -551,4 +605,25 @@ public final class AnalysisRegistry implements Closeable {
}
}
}
private void processNormalizerFactory(DeprecationLogger deprecationLogger,
IndexSettings indexSettings,
String name,
AnalyzerProvider<?> normalizerFactory,
Map<String, NamedAnalyzer> normalizers,
Map<String, TokenFilterFactory> tokenFilters,
Map<String, CharFilterFactory> charFilters) {
if (normalizerFactory instanceof CustomNormalizerProvider) {
((CustomNormalizerProvider) normalizerFactory).build(charFilters, tokenFilters);
}
Analyzer normalizerF = normalizerFactory.get();
if (normalizerF == null) {
throw new IllegalArgumentException("normalizer [" + normalizerFactory.name() + "] created null normalizer");
}
NamedAnalyzer normalizer = new NamedAnalyzer(name, normalizerFactory.scope(), normalizerF);
if (normalizers.containsKey(name)) {
throw new IllegalStateException("already registered analyzer with name: " + name);
}
normalizers.put(name, normalizer);
}
}

View File

@ -94,4 +94,27 @@ public final class CustomAnalyzer extends Analyzer {
}
return reader;
}
@Override
protected Reader initReaderForNormalization(String fieldName, Reader reader) {
for (CharFilterFactory charFilter : charFilters) {
if (charFilter instanceof MultiTermAwareComponent) {
charFilter = (CharFilterFactory) ((MultiTermAwareComponent) charFilter).getMultiTermComponent();
reader = charFilter.create(reader);
}
}
return reader;
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = in;
for (TokenFilterFactory filter : tokenFilters) {
if (filter instanceof MultiTermAwareComponent) {
filter = (TokenFilterFactory) ((MultiTermAwareComponent) filter).getMultiTermComponent();
result = filter.create(result);
}
}
return result;
}
}
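
The two overrides above only apply char and token filters that implement
MultiTermAwareComponent and silently skip the rest; explicit normalizers, by
contrast, reject unsupported filters outright, as CustomNormalizerProvider below
shows. For illustration, a hedged sketch of a token filter factory that qualifies,
modeled on the existing lowercase factory (the class name is hypothetical):

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.MultiTermAwareComponent;

public class LowercaseSketchFilterFactory extends AbstractTokenFilterFactory
        implements MultiTermAwareComponent {

    public LowercaseSketchFilterFactory(IndexSettings indexSettings, String name, Settings settings) {
        super(indexSettings, name, settings);
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        return new LowerCaseFilter(tokenStream);
    }

    @Override
    public Object getMultiTermComponent() {
        // lowercasing works per character, so the factory is safe to reuse
        // as-is for normalization and multi-term queries
        return this;
    }
}

Filters that cannot operate per character, such as stemmers or synonyms, simply do
not implement the interface, which is what keeps them out of the normalization chain.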

View File

@ -0,0 +1,95 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.indices.analysis.PreBuiltTokenizers;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
* A custom normalizer that is built out of char and token filters. Unlike
* analyzers, it does not support tokenizers and only supports a subset of
* char and token filters.
*/
public final class CustomNormalizerProvider extends AbstractIndexAnalyzerProvider<CustomAnalyzer> {
private final Settings analyzerSettings;
private CustomAnalyzer customAnalyzer;
public CustomNormalizerProvider(IndexSettings indexSettings,
String name, Settings settings) {
super(indexSettings, name, settings);
this.analyzerSettings = settings;
}
public void build(final Map<String, CharFilterFactory> charFilters, final Map<String, TokenFilterFactory> tokenFilters) {
String tokenizerName = analyzerSettings.get("tokenizer");
if (tokenizerName != null) {
throw new IllegalArgumentException("Custom normalizer [" + name() + "] cannot configure a tokenizer");
}
List<CharFilterFactory> charFiltersList = new ArrayList<>();
String[] charFilterNames = analyzerSettings.getAsArray("char_filter");
for (String charFilterName : charFilterNames) {
CharFilterFactory charFilter = charFilters.get(charFilterName);
if (charFilter == null) {
throw new IllegalArgumentException("Custom normalizer [" + name() + "] failed to find char_filter under name ["
+ charFilterName + "]");
}
if (charFilter instanceof MultiTermAwareComponent == false) {
throw new IllegalArgumentException("Custom normalizer [" + name() + "] may not use char filter ["
+ charFilterName + "]");
}
charFilter = (CharFilterFactory) ((MultiTermAwareComponent) charFilter).getMultiTermComponent();
charFiltersList.add(charFilter);
}
List<TokenFilterFactory> tokenFilterList = new ArrayList<>();
String[] tokenFilterNames = analyzerSettings.getAsArray("filter");
for (String tokenFilterName : tokenFilterNames) {
TokenFilterFactory tokenFilter = tokenFilters.get(tokenFilterName);
if (tokenFilter == null) {
throw new IllegalArgumentException("Custom Analyzer [" + name() + "] failed to find filter under name ["
+ tokenFilterName + "]");
}
if (tokenFilter instanceof MultiTermAwareComponent == false) {
throw new IllegalArgumentException("Custom normalizer [" + name() + "] may not use filter [" + tokenFilterName + "]");
}
tokenFilter = (TokenFilterFactory) ((MultiTermAwareComponent) tokenFilter).getMultiTermComponent();
tokenFilterList.add(tokenFilter);
}
this.customAnalyzer = new CustomAnalyzer(
PreBuiltTokenizers.KEYWORD.getTokenizerFactory(indexSettings.getIndexVersionCreated()),
charFiltersList.toArray(new CharFilterFactory[charFiltersList.size()]),
tokenFilterList.toArray(new TokenFilterFactory[tokenFilterList.size()])
);
}
@Override
public CustomAnalyzer get() {
return this.customAnalyzer;
}
}
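
For orientation, a small sketch of the index settings shape this provider consumes,
using the same Settings builder API as the tests in this commit (the class name is
hypothetical). Note that build() above rejects a configured tokenizer and any filter
that is not multi-term aware.

import org.elasticsearch.common.settings.Settings;

public class NormalizerSettingsSketch {
    public static void main(String[] args) {
        Settings settings = Settings.builder()
                // type "custom" (or no type at all) routes to CustomNormalizerProvider
                .put("index.analysis.normalizer.my_normalizer.type", "custom")
                // only multi-term-aware filters are accepted here
                .putArray("index.analysis.normalizer.my_normalizer.filter", "lowercase", "asciifolding")
                .build();
        System.out.println(settings.getAsMap());
    }
}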

View File

@ -25,6 +25,7 @@ import org.elasticsearch.index.IndexSettings;
import java.io.Closeable;
import java.io.IOException;
import java.util.Map;
import java.util.stream.Stream;
/**
* IndexAnalyzers contains a name to analyzer mapping for a specific index.
@ -38,15 +39,18 @@ public final class IndexAnalyzers extends AbstractIndexComponent implements Clos
private final NamedAnalyzer defaultSearchAnalyzer;
private final NamedAnalyzer defaultSearchQuoteAnalyzer;
private final Map<String, NamedAnalyzer> analyzers;
private final Map<String, NamedAnalyzer> normalizers;
private final IndexSettings indexSettings;
public IndexAnalyzers(IndexSettings indexSettings, NamedAnalyzer defaultIndexAnalyzer, NamedAnalyzer defaultSearchAnalyzer,
NamedAnalyzer defaultSearchQuoteAnalyzer, Map<String, NamedAnalyzer> analyzers) {
NamedAnalyzer defaultSearchQuoteAnalyzer, Map<String, NamedAnalyzer> analyzers,
Map<String, NamedAnalyzer> normalizers) {
super(indexSettings);
this.defaultIndexAnalyzer = defaultIndexAnalyzer;
this.defaultSearchAnalyzer = defaultSearchAnalyzer;
this.defaultSearchQuoteAnalyzer = defaultSearchQuoteAnalyzer;
this.analyzers = analyzers;
this.normalizers = normalizers;
this.indexSettings = indexSettings;
}
@ -57,6 +61,12 @@ public final class IndexAnalyzers extends AbstractIndexComponent implements Clos
return analyzers.get(name);
}
/**
* Returns a normalizer mapped to the given name or <code>null</code> if not present
*/
public NamedAnalyzer getNormalizer(String name) {
return normalizers.get(name);
}
/**
* Returns the default index analyzer for this index
@ -81,7 +91,7 @@ public final class IndexAnalyzers extends AbstractIndexComponent implements Clos
@Override
public void close() throws IOException {
IOUtils.close(() -> analyzers.values().stream()
IOUtils.close(() -> Stream.concat(analyzers.values().stream(), normalizers.values().stream())
.filter(a -> a.scope() == AnalyzerScope.INDEX)
.iterator());
}

View File

@ -19,16 +19,20 @@
package org.elasticsearch.index.mapper;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.common.xcontent.support.XContentMapValues;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.plain.DocValuesIndexFieldData;
@ -36,6 +40,7 @@ import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import static org.elasticsearch.index.mapper.TypeParsers.parseField;
@ -70,6 +75,11 @@ public final class KeywordFieldMapper extends FieldMapper {
builder = this;
}
@Override
public KeywordFieldType fieldType() {
return (KeywordFieldType) super.fieldType();
}
public Builder ignoreAbove(int ignoreAbove) {
if (ignoreAbove < 0) {
throw new IllegalArgumentException("[ignore_above] must be positive, got " + ignoreAbove);
@ -92,6 +102,12 @@ public final class KeywordFieldMapper extends FieldMapper {
return builder;
}
public Builder normalizer(NamedAnalyzer normalizer) {
fieldType().setNormalizer(normalizer);
fieldType().setSearchAnalyzer(normalizer);
return builder;
}
@Override
public KeywordFieldMapper build(BuilderContext context) {
setupFieldType(context);
@ -103,7 +119,7 @@ public final class KeywordFieldMapper extends FieldMapper {
public static class TypeParser implements Mapper.TypeParser {
@Override
public Mapper.Builder parse(String name, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException {
public Mapper.Builder<?,?> parse(String name, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException {
KeywordFieldMapper.Builder builder = new KeywordFieldMapper.Builder(name);
parseField(builder, name, node, parserContext);
for (Iterator<Map.Entry<String, Object>> iterator = node.entrySet().iterator(); iterator.hasNext();) {
@ -125,6 +141,15 @@ public final class KeywordFieldMapper extends FieldMapper {
} else if (propName.equals("eager_global_ordinals")) {
builder.eagerGlobalOrdinals(XContentMapValues.nodeBooleanValue(propNode));
iterator.remove();
} else if (propName.equals("normalizer")) {
if (propNode != null) {
NamedAnalyzer normalizer = parserContext.getIndexAnalyzers().getNormalizer(propNode.toString());
if (normalizer == null) {
throw new MapperParsingException("normalizer [" + propNode.toString() + "] not found for field [" + name + "]");
}
builder.normalizer(normalizer);
}
iterator.remove();
}
}
return builder;
@ -133,21 +158,58 @@ public final class KeywordFieldMapper extends FieldMapper {
public static final class KeywordFieldType extends StringFieldType {
public KeywordFieldType() {}
private NamedAnalyzer normalizer = null;
public KeywordFieldType() {
setIndexAnalyzer(Lucene.KEYWORD_ANALYZER);
setSearchAnalyzer(Lucene.KEYWORD_ANALYZER);
}
protected KeywordFieldType(KeywordFieldType ref) {
super(ref);
this.normalizer = ref.normalizer;
}
public KeywordFieldType clone() {
return new KeywordFieldType(this);
}
@Override
public boolean equals(Object o) {
if (super.equals(o) == false) {
return false;
}
return Objects.equals(normalizer, ((KeywordFieldType) o).normalizer);
}
@Override
public void checkCompatibility(MappedFieldType otherFT, List<String> conflicts, boolean strict) {
super.checkCompatibility(otherFT, conflicts, strict);
KeywordFieldType other = (KeywordFieldType) otherFT;
if (Objects.equals(normalizer, other.normalizer) == false) {
conflicts.add("mapper [" + name() + "] has different [normalizer]");
}
}
@Override
public int hashCode() {
return 31 * super.hashCode() + Objects.hashCode(normalizer);
}
@Override
public String typeName() {
return CONTENT_TYPE;
}
public NamedAnalyzer normalizer() {
return normalizer;
}
public void setNormalizer(NamedAnalyzer normalizer) {
checkIfFrozen();
this.normalizer = normalizer;
}
@Override
public Query nullValueQuery() {
if (nullValue() == null) {
@ -171,13 +233,25 @@ public final class KeywordFieldMapper extends FieldMapper {
BytesRef binaryValue = (BytesRef) value;
return binaryValue.utf8ToString();
}
@Override
protected BytesRef indexedValueForSearch(Object value) {
if (value == null) {
return null;
}
if (value instanceof BytesRef) {
value = ((BytesRef) value).utf8ToString();
}
return searchAnalyzer().normalize(name(), value.toString());
}
}
private Boolean includeInAll;
private int ignoreAbove;
protected KeywordFieldMapper(String simpleName, MappedFieldType fieldType, MappedFieldType defaultFieldType,
int ignoreAbove, Boolean includeInAll, Settings indexSettings, MultiFields multiFields, CopyTo copyTo) {
int ignoreAbove, Boolean includeInAll,
Settings indexSettings, MultiFields multiFields, CopyTo copyTo) {
super(simpleName, fieldType, defaultFieldType, indexSettings, multiFields, copyTo);
assert fieldType.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) <= 0;
this.ignoreAbove = ignoreAbove;
@ -196,6 +270,11 @@ public final class KeywordFieldMapper extends FieldMapper {
return (KeywordFieldMapper) super.clone();
}
@Override
public KeywordFieldType fieldType() {
return (KeywordFieldType) super.fieldType();
}
// pkg-private for testing
Boolean includeInAll() {
return includeInAll;
@ -203,7 +282,7 @@ public final class KeywordFieldMapper extends FieldMapper {
@Override
protected void parseCreateField(ParseContext context, List<IndexableField> fields) throws IOException {
final String value;
String value;
if (context.externalValueSet()) {
value = context.externalValue().toString();
} else {
@ -219,6 +298,27 @@ public final class KeywordFieldMapper extends FieldMapper {
return;
}
final NamedAnalyzer normalizer = fieldType().normalizer();
if (normalizer != null) {
try (final TokenStream ts = normalizer.tokenStream(name(), value)) {
final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
ts.reset();
if (ts.incrementToken() == false) {
throw new IllegalStateException("The normalization token stream is "
+ "expected to produce exactly 1 token, but got 0 for analyzer "
+ normalizer + " and input \"" + value + "\"");
}
final String newValue = termAtt.toString();
if (ts.incrementToken()) {
throw new IllegalStateException("The normalization token stream is "
+ "expected to produce exactly 1 token, but got 2+ for analyzer "
+ normalizer + " and input \"" + value + "\"");
}
ts.end();
value = newValue;
}
}
if (context.includeInAll(includeInAll, this)) {
context.allEntries().addText(fieldType().name(), value, fieldType().boost());
}
@ -263,5 +363,11 @@ public final class KeywordFieldMapper extends FieldMapper {
if (includeDefaults || ignoreAbove != Defaults.IGNORE_ABOVE) {
builder.field("ignore_above", ignoreAbove);
}
if (fieldType().normalizer() != null) {
builder.field("normalizer", fieldType().normalizer().name());
} else if (includeDefaults) {
builder.nullField("normalizer");
}
}
}
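
Putting the two halves of the mapper together: parseCreateField normalizes the value
before indexing, and indexedValueForSearch runs query input through the same chain,
so both sides agree on the indexed term. A minimal, hypothetical sketch of that
symmetry, again assuming Lucene's Analyzer#normalize:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.util.BytesRef;

public class NormalizerSymmetrySketch {
    public static void main(String[] args) {
        Analyzer normalizer = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer source = new KeywordTokenizer();
                return new TokenStreamComponents(source, new LowerCaseFilter(source));
            }
            @Override
            protected TokenStream normalize(String fieldName, TokenStream in) {
                return new LowerCaseFilter(in);
            }
        };
        BytesRef indexed = normalizer.normalize("field", "AbC"); // what gets indexed
        BytesRef queried = normalizer.normalize("field", "ABC"); // what a term query looks up
        System.out.println(indexed.equals(queried)); // true: the query matches the stored term
    }
}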

View File

@ -170,8 +170,9 @@ public final class AnalysisModule {
NamedRegistry<AnalysisProvider<TokenFilterFactory>> tokenFilters = setupTokenFilters(plugins, hunspellService);
NamedRegistry<AnalysisProvider<TokenizerFactory>> tokenizers = setupTokenizers(plugins);
NamedRegistry<AnalysisProvider<AnalyzerProvider<?>>> analyzers = setupAnalyzers(plugins);
NamedRegistry<AnalysisProvider<AnalyzerProvider<?>>> normalizers = setupNormalizers(plugins);
analysisRegistry = new AnalysisRegistry(environment, charFilters.getRegistry(), tokenFilters.getRegistry(), tokenizers
.getRegistry(), analyzers.getRegistry());
.getRegistry(), analyzers.getRegistry(), normalizers.getRegistry());
}
HunspellService getHunspellService() {
@ -334,6 +335,13 @@ public final class AnalysisModule {
return analyzers;
}
private NamedRegistry<AnalysisProvider<AnalyzerProvider<?>>> setupNormalizers(List<AnalysisPlugin> plugins) {
NamedRegistry<AnalysisProvider<AnalyzerProvider<?>>> normalizers = new NamedRegistry<>("normalizer");
// TODO: provide built-in normalizer providers?
// TODO: pluggability?
return normalizers;
}
private static <T> AnalysisModule.AnalysisProvider<T> requriesAnalysisSettings(AnalysisModule.AnalysisProvider<T> provider) {
return new AnalysisModule.AnalysisProvider<T>() {
@Override

View File

@ -435,7 +435,7 @@ public class GatewayIndexStateIT extends ESIntegTestCase {
assertEquals(ex.getMessage(), "Failed to verify index " + metaData.getIndex());
assertNotNull(ex.getCause());
assertEquals(IllegalArgumentException.class, ex.getCause().getClass());
assertEquals(ex.getCause().getMessage(), "Unknown tokenfilter type [icu_collation] for [myCollator]");
assertEquals(ex.getCause().getMessage(), "Unknown filter type [icu_collation] for [myCollator]");
}
/**

View File

@ -148,7 +148,7 @@ public class IndexModuleTests extends ESTestCase {
public void testWrapperIsBound() throws IOException {
IndexModule module = new IndexModule(indexSettings,
new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap()));
new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()));
module.setSearcherWrapper((s) -> new Wrapper());
module.engineFactory.set(new MockEngineFactory(AssertingDirectoryReader.class));
@ -168,7 +168,7 @@ public class IndexModuleTests extends ESTestCase {
.build();
IndexSettings indexSettings = IndexSettingsModule.newIndexSettings(index, settings);
IndexModule module = new IndexModule(indexSettings,
new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap()));
new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()));
module.addIndexStore("foo_store", FooStore::new);
try {
module.addIndexStore("foo_store", FooStore::new);
@ -193,7 +193,7 @@ public class IndexModuleTests extends ESTestCase {
};
IndexSettings indexSettings = IndexSettingsModule.newIndexSettings(index, settings);
IndexModule module = new IndexModule(indexSettings,
new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap()));
new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()));
module.addIndexEventListener(eventListener);
IndexService indexService = newIndexService(module);
IndexSettings x = indexService.getIndexSettings();
@ -208,7 +208,7 @@ public class IndexModuleTests extends ESTestCase {
public void testListener() throws IOException {
Setting<Boolean> booleanSetting = Setting.boolSetting("index.foo.bar", false, Property.Dynamic, Property.IndexScope);
IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings(index, settings, booleanSetting),
new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap()));
new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()));
Setting<Boolean> booleanSetting2 = Setting.boolSetting("index.foo.bar.baz", false, Property.Dynamic, Property.IndexScope);
AtomicBoolean atomicBoolean = new AtomicBoolean(false);
module.addSettingsUpdateConsumer(booleanSetting, atomicBoolean::set);
@ -228,7 +228,7 @@ public class IndexModuleTests extends ESTestCase {
public void testAddIndexOperationListener() throws IOException {
IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings(index, settings),
new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap()));
new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()));
AtomicBoolean executed = new AtomicBoolean(false);
IndexingOperationListener listener = new IndexingOperationListener() {
@Override
@ -257,7 +257,7 @@ public class IndexModuleTests extends ESTestCase {
public void testAddSearchOperationListener() throws IOException {
IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings(index, settings),
new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap()));
new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()));
AtomicBoolean executed = new AtomicBoolean(false);
SearchOperationListener listener = new SearchOperationListener() {
@ -291,7 +291,7 @@ public class IndexModuleTests extends ESTestCase {
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings),
new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap()));
new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()));
module.addSimilarity("test_similarity", (string, settings) -> new SimilarityProvider() {
@Override
public String name() {
@ -315,7 +315,7 @@ public class IndexModuleTests extends ESTestCase {
public void testFrozen() {
IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings(index, settings),
new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap()));
new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()));
module.freeze();
String msg = "Can't modify IndexModule once the index service has been created";
assertEquals(msg, expectThrows(IllegalStateException.class, () -> module.addSearchOperationListener(null)).getMessage());
@ -334,7 +334,7 @@ public class IndexModuleTests extends ESTestCase {
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings),
new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap()));
new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()));
Exception ex = expectThrows(IllegalArgumentException.class, () -> newIndexService(module));
assertEquals("Unknown Similarity type [test_similarity] for [my_similarity]", ex.getMessage());
}
@ -346,7 +346,7 @@ public class IndexModuleTests extends ESTestCase {
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.build();
IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings),
new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap()));
new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()));
Exception ex = expectThrows(IllegalArgumentException.class, () -> newIndexService(module));
assertEquals("Similarity [my_similarity] must have an associated type", ex.getMessage());
}
@ -356,7 +356,7 @@ public class IndexModuleTests extends ESTestCase {
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build();
IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings),
new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap()));
new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()));
module.forceQueryCacheProvider((a, b) -> new CustomQueryCache());
expectThrows(AlreadySetException.class, () -> module.forceQueryCacheProvider((a, b) -> new CustomQueryCache()));
IndexService indexService = newIndexService(module);
@ -369,7 +369,7 @@ public class IndexModuleTests extends ESTestCase {
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build();
IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings),
new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap()));
new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()));
IndexService indexService = newIndexService(module);
assertTrue(indexService.cache().query() instanceof IndexQueryCache);
indexService.close("simon says", false);
@ -381,7 +381,7 @@ public class IndexModuleTests extends ESTestCase {
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build();
IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings),
new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap()));
new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()));
module.forceQueryCacheProvider((a, b) -> new CustomQueryCache());
IndexService indexService = newIndexService(module);
assertTrue(indexService.cache().query() instanceof DisabledQueryCache);

View File

@ -65,7 +65,7 @@ public class AnalysisRegistryTests extends ESTestCase {
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
registry = new AnalysisRegistry(new Environment(settings),
emptyMap(), emptyMap(), emptyMap(), emptyMap());
emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap());
}
public void testDefaultAnalyzers() throws IOException {
@ -76,7 +76,8 @@ public class AnalysisRegistryTests extends ESTestCase {
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
IndexAnalyzers indexAnalyzers = new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap())
IndexAnalyzers indexAnalyzers = new AnalysisRegistry(new Environment(settings),
emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())
.build(idxSettings);
assertThat(indexAnalyzers.getDefaultIndexAnalyzer().analyzer(), instanceOf(StandardAnalyzer.class));
assertThat(indexAnalyzers.getDefaultSearchAnalyzer().analyzer(), instanceOf(StandardAnalyzer.class));
@ -88,7 +89,7 @@ public class AnalysisRegistryTests extends ESTestCase {
Settings settings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, version).build();
IndexAnalyzers indexAnalyzers = registry.build(IndexSettingsModule.newIndexSettings("index", settings),
singletonMap("default", analyzerProvider("default"))
, emptyMap(), emptyMap(), emptyMap());
, emptyMap(), emptyMap(), emptyMap(), emptyMap());
assertThat(indexAnalyzers.getDefaultIndexAnalyzer().analyzer(), instanceOf(EnglishAnalyzer.class));
assertThat(indexAnalyzers.getDefaultSearchAnalyzer().analyzer(), instanceOf(EnglishAnalyzer.class));
assertThat(indexAnalyzers.getDefaultSearchQuoteAnalyzer().analyzer(), instanceOf(EnglishAnalyzer.class));
@ -100,7 +101,7 @@ public class AnalysisRegistryTests extends ESTestCase {
AnalyzerProvider<?> defaultIndex = new PreBuiltAnalyzerProvider("default_index", AnalyzerScope.INDEX, new EnglishAnalyzer());
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
() -> registry.build(IndexSettingsModule.newIndexSettings("index", settings),
singletonMap("default_index", defaultIndex), emptyMap(), emptyMap(), emptyMap()));
singletonMap("default_index", defaultIndex), emptyMap(), emptyMap(), emptyMap(), emptyMap()));
assertTrue(e.getMessage().contains("[index.analysis.analyzer.default_index] is not supported"));
}
@ -109,7 +110,7 @@ public class AnalysisRegistryTests extends ESTestCase {
VersionUtils.getPreviousVersion(Version.V_5_0_0_alpha1));
Settings settings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, version).build();
IndexAnalyzers indexAnalyzers = registry.build(IndexSettingsModule.newIndexSettings("index", settings),
singletonMap("default_index", analyzerProvider("default_index")), emptyMap(), emptyMap(), emptyMap());
singletonMap("default_index", analyzerProvider("default_index")), emptyMap(), emptyMap(), emptyMap(), emptyMap());
assertThat(indexAnalyzers.getDefaultIndexAnalyzer().analyzer(), instanceOf(EnglishAnalyzer.class));
assertThat(indexAnalyzers.getDefaultSearchAnalyzer().analyzer(), instanceOf(StandardAnalyzer.class));
assertThat(indexAnalyzers.getDefaultSearchQuoteAnalyzer().analyzer(), instanceOf(StandardAnalyzer.class));
@ -121,7 +122,7 @@ public class AnalysisRegistryTests extends ESTestCase {
Version version = VersionUtils.randomVersion(random());
Settings settings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, version).build();
IndexAnalyzers indexAnalyzers = registry.build(IndexSettingsModule.newIndexSettings("index", settings),
singletonMap("default_search", analyzerProvider("default_search")), emptyMap(), emptyMap(), emptyMap());
singletonMap("default_search", analyzerProvider("default_search")), emptyMap(), emptyMap(), emptyMap(), emptyMap());
assertThat(indexAnalyzers.getDefaultIndexAnalyzer().analyzer(), instanceOf(StandardAnalyzer.class));
assertThat(indexAnalyzers.getDefaultSearchAnalyzer().analyzer(), instanceOf(EnglishAnalyzer.class));
assertThat(indexAnalyzers.getDefaultSearchQuoteAnalyzer().analyzer(), instanceOf(EnglishAnalyzer.class));
@ -135,7 +136,7 @@ public class AnalysisRegistryTests extends ESTestCase {
analyzers.put("default_index", analyzerProvider("default_index"));
analyzers.put("default_search", analyzerProvider("default_search"));
IndexAnalyzers indexAnalyzers = registry.build(IndexSettingsModule.newIndexSettings("index", settings),
analyzers, Collections.emptyMap(), Collections.emptyMap(), Collections.emptyMap());
analyzers, Collections.emptyMap(), Collections.emptyMap(), Collections.emptyMap(), Collections.emptyMap());
assertThat(indexAnalyzers.getDefaultIndexAnalyzer().analyzer(), instanceOf(EnglishAnalyzer.class));
assertThat(indexAnalyzers.getDefaultSearchAnalyzer().analyzer(), instanceOf(EnglishAnalyzer.class));
assertThat(indexAnalyzers.getDefaultSearchQuoteAnalyzer().analyzer(), instanceOf(EnglishAnalyzer.class));
@ -196,10 +197,11 @@ public class AnalysisRegistryTests extends ESTestCase {
Settings indexSettings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
IndexAnalyzers indexAnalyzers = new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap())
IndexAnalyzers indexAnalyzers = new AnalysisRegistry(new Environment(settings),
emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())
.build(idxSettings);
IndexAnalyzers otherIndexAnalyzers = new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(),
emptyMap()).build(idxSettings);
emptyMap(), emptyMap()).build(idxSettings);
final int numIters = randomIntBetween(5, 20);
for (int i = 0; i < numIters; i++) {
PreBuiltAnalyzers preBuiltAnalyzers = RandomPicks.randomFrom(random(), PreBuiltAnalyzers.values());
@ -219,7 +221,8 @@ public class AnalysisRegistryTests extends ESTestCase {
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
() -> new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap()).build(idxSettings));
() -> new AnalysisRegistry(new Environment(settings),
emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()).build(idxSettings));
assertThat(e.getMessage(), equalTo("analyzer [test_analyzer] must specify either an analyzer type, or a tokenizer"));
}
@ -228,7 +231,8 @@ public class AnalysisRegistryTests extends ESTestCase {
Settings indexSettings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
IndexAnalyzers indexAnalyzers = new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap())
IndexAnalyzers indexAnalyzers = new AnalysisRegistry(new Environment(settings),
emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())
.build(idxSettings);
indexAnalyzers.close();
indexAnalyzers.close();

View File

@ -0,0 +1,102 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.ESTokenStreamTestCase;
import java.io.IOException;
public class CustomNormalizerTests extends ESTokenStreamTestCase {
public void testBasics() throws IOException {
Settings settings = Settings.builder()
.putArray("index.analysis.normalizer.my_normalizer.filter", "lowercase", "asciifolding")
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
assertNull(analysis.indexAnalyzers.get("my_normalizer"));
NamedAnalyzer normalizer = analysis.indexAnalyzers.getNormalizer("my_normalizer");
assertNotNull(normalizer);
assertEquals("my_normalizer", normalizer.name());
assertTokenStreamContents(normalizer.tokenStream("foo", "Cet été-là"), new String[] {"cet ete-la"});
assertEquals(new BytesRef("cet ete-la"), normalizer.normalize("foo", "Cet été-là"));
}
public void testUnknownType() {
Settings settings = Settings.builder()
.put("index.analysis.normalizer.my_normalizer.type", "foobar")
.putArray("index.analysis.normalizer.my_normalizer.filter", "lowercase", "asciifolding")
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
() -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings));
assertEquals("Unknown normalizer type [foobar] for [my_normalizer]", e.getMessage());
}
public void testTokenizer() throws IOException {
Settings settings = Settings.builder()
.put("index.analysis.normalizer.my_normalizer.tokenizer", "keyword")
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
() -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings));
assertEquals("Custom normalizer [my_normalizer] cannot configure a tokenizer", e.getMessage());
}
public void testCharFilters() throws IOException {
Settings settings = Settings.builder()
.put("index.analysis.char_filter.my_mapping.type", "mapping")
.putArray("index.analysis.char_filter.my_mapping.mappings", "a => z")
.putArray("index.analysis.normalizer.my_normalizer.char_filter", "my_mapping")
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
assertNull(analysis.indexAnalyzers.get("my_normalizer"));
NamedAnalyzer normalizer = analysis.indexAnalyzers.getNormalizer("my_normalizer");
assertNotNull(normalizer);
assertEquals("my_normalizer", normalizer.name());
assertTokenStreamContents(normalizer.tokenStream("foo", "abc"), new String[] {"zbc"});
assertEquals(new BytesRef("zbc"), normalizer.normalize("foo", "abc"));
}
public void testIllegalFilters() throws IOException {
Settings settings = Settings.builder()
.putArray("index.analysis.normalizer.my_normalizer.filter", "porter_stem")
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
() -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings));
assertEquals("Custom normalizer [my_normalizer] may not use filter [porter_stem]", e.getMessage());
}
public void testIllegalCharFilters() throws IOException {
Settings settings = Settings.builder()
.putArray("index.analysis.normalizer.my_normalizer.char_filter", "html_strip")
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
() -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings));
assertEquals("Custom normalizer [my_normalizer] may not use char filter [html_strip]", e.getMessage());
}
}

View File

@ -2313,7 +2313,7 @@ public class InternalEngineTests extends ESTestCase {
Index index = new Index(indexName, "_na_");
IndexSettings indexSettings = IndexSettingsModule.newIndexSettings(index, settings);
NamedAnalyzer defaultAnalyzer = new NamedAnalyzer("default", AnalyzerScope.INDEX, new StandardAnalyzer());
IndexAnalyzers indexAnalyzers = new IndexAnalyzers(indexSettings, defaultAnalyzer, defaultAnalyzer, defaultAnalyzer, Collections.emptyMap());
IndexAnalyzers indexAnalyzers = new IndexAnalyzers(indexSettings, defaultAnalyzer, defaultAnalyzer, defaultAnalyzer, Collections.emptyMap(), Collections.emptyMap());
SimilarityService similarityService = new SimilarityService(indexSettings, Collections.emptyMap());
MapperRegistry mapperRegistry = new IndicesModule(Collections.emptyList()).getMapperRegistry();
mapperService = new MapperService(indexSettings, indexAnalyzers, xContentRegistry, similarityService, mapperRegistry,

View File

@ -25,8 +25,10 @@ import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.IndexableFieldType;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.compress.CompressedXContent;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.index.IndexService;
import org.elasticsearch.index.mapper.MapperService.MergeReason;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.test.ESSingleNodeTestCase;
import org.elasticsearch.test.InternalSettingsPlugin;
@ -51,7 +53,11 @@ public class KeywordFieldMapperTests extends ESSingleNodeTestCase {
@Before
public void setup() {
indexService = createIndex("test");
indexService = createIndex("test", Settings.builder()
.put("index.analysis.normalizer.my_lowercase.type", "custom")
.putArray("index.analysis.normalizer.my_lowercase.filter", "lowercase")
.put("index.analysis.normalizer.my_asciifolding.type", "custom")
.putArray("index.analysis.normalizer.my_asciifolding.filter", "asciifolding").build());
parser = indexService.mapperService().documentMapperParser();
}
@ -283,6 +289,62 @@ public class KeywordFieldMapperTests extends ESSingleNodeTestCase {
assertFalse(fields[0].fieldType().omitNorms());
}
public void testNormalizer() throws IOException {
String mapping = XContentFactory.jsonBuilder().startObject().startObject("type")
.startObject("properties").startObject("field")
.field("type", "keyword").field("normalizer", "my_lowercase").endObject().endObject()
.endObject().endObject().string();
DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping));
assertEquals(mapping, mapper.mappingSource().toString());
ParsedDocument doc = mapper.parse("test", "type", "1", XContentFactory.jsonBuilder()
.startObject()
.field("field", "AbC")
.endObject()
.bytes());
IndexableField[] fields = doc.rootDoc().getFields("field");
assertEquals(2, fields.length);
assertEquals(new BytesRef("abc"), fields[0].binaryValue());
IndexableFieldType fieldType = fields[0].fieldType();
assertThat(fieldType.omitNorms(), equalTo(true));
assertFalse(fieldType.tokenized());
assertFalse(fieldType.stored());
assertThat(fieldType.indexOptions(), equalTo(IndexOptions.DOCS));
assertThat(fieldType.storeTermVectors(), equalTo(false));
assertThat(fieldType.storeTermVectorOffsets(), equalTo(false));
assertThat(fieldType.storeTermVectorPositions(), equalTo(false));
assertThat(fieldType.storeTermVectorPayloads(), equalTo(false));
assertEquals(DocValuesType.NONE, fieldType.docValuesType());
assertEquals(new BytesRef("abc"), fields[1].binaryValue());
fieldType = fields[1].fieldType();
assertThat(fieldType.indexOptions(), equalTo(IndexOptions.NONE));
assertEquals(DocValuesType.SORTED_SET, fieldType.docValuesType());
}
public void testUpdateNormalizer() throws IOException {
String mapping = XContentFactory.jsonBuilder().startObject().startObject("type")
.startObject("properties").startObject("field")
.field("type", "keyword").field("normalizer", "my_lowercase").endObject().endObject()
.endObject().endObject().string();
indexService.mapperService().merge("type", new CompressedXContent(mapping), MergeReason.MAPPING_UPDATE, randomBoolean());
String mapping2 = XContentFactory.jsonBuilder().startObject().startObject("type")
.startObject("properties").startObject("field")
.field("type", "keyword").field("normalizer", "my_asciifolding").endObject().endObject()
.endObject().endObject().string();
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
() -> indexService.mapperService().merge("type",
new CompressedXContent(mapping2), MergeReason.MAPPING_UPDATE, randomBoolean()));
assertEquals(
"Mapper for [field] conflicts with existing mapping in other types:\n[mapper [field] has different [normalizer]]",
e.getMessage());
}
public void testEmptyName() throws IOException {
String mapping = XContentFactory.jsonBuilder().startObject()
.startObject("type")

View File

@ -20,22 +20,41 @@ package org.elasticsearch.index.mapper;
import com.carrotsearch.randomizedtesting.generators.RandomStrings;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.TermsQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.search.TermQuery;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.unit.Fuzziness;
import org.elasticsearch.index.mapper.KeywordFieldMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.analysis.AnalyzerScope;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.mapper.KeywordFieldMapper.KeywordFieldType;
import org.elasticsearch.index.mapper.MappedFieldType.Relation;
import org.junit.Before;
import java.io.IOException;
import java.util.Arrays;
public class KeywordFieldTypeTests extends FieldTypeTestCase {
@Before
public void setupProperties() {
addModifier(new Modifier("normalizer", false) {
@Override
public void modify(MappedFieldType ft) {
((KeywordFieldType) ft).setNormalizer(Lucene.KEYWORD_ANALYZER);
}
});
}
@Override
protected MappedFieldType createDefaultFieldType() {
return new KeywordFieldMapper.KeywordFieldType();
@ -62,6 +81,31 @@ public class KeywordFieldTypeTests extends FieldTypeTestCase {
assertEquals("Cannot search on field [field] since it is not indexed.", e.getMessage());
}
public void testTermQueryWithNormalizer() {
MappedFieldType ft = createDefaultFieldType();
ft.setName("field");
ft.setIndexOptions(IndexOptions.DOCS);
Analyzer normalizer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer in = new WhitespaceTokenizer();
TokenFilter out = new LowerCaseFilter(in);
return new TokenStreamComponents(in, out);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
return new LowerCaseFilter(in);
}
};
ft.setSearchAnalyzer(new NamedAnalyzer("my_normalizer", AnalyzerScope.INDEX, normalizer));
assertEquals(new TermQuery(new Term("field", "foo bar")), ft.termQuery("fOo BaR", null));
ft.setIndexOptions(IndexOptions.NONE);
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
() -> ft.termQuery("bar", null));
assertEquals("Cannot search on field [field] since it is not indexed.", e.getMessage());
}
public void testTermsQuery() {
MappedFieldType ft = createDefaultFieldType();
ft.setName("field");

View File

@ -101,7 +101,7 @@ public class ParentFieldMapperTests extends ESSingleNodeTestCase {
IndexSettings indexSettings = IndexSettingsModule.newIndexSettings(index, Settings.EMPTY);
NamedAnalyzer namedAnalyzer = new NamedAnalyzer("default", AnalyzerScope.INDEX, new StandardAnalyzer());
IndexAnalyzers indexAnalyzers = new IndexAnalyzers(indexSettings, namedAnalyzer, namedAnalyzer, namedAnalyzer,
Collections.emptyMap());
Collections.emptyMap(), Collections.emptyMap());
SimilarityService similarityService = new SimilarityService(indexSettings, Collections.emptyMap());
MapperService mapperService = new MapperService(indexSettings, indexAnalyzers, xContentRegistry(), similarityService,
new IndicesModule(emptyList()).getMapperRegistry(), () -> null);

View File

@ -112,6 +112,8 @@ include::analysis/testing.asciidoc[]
include::analysis/analyzers.asciidoc[]
include::analysis/normalizers.asciidoc[]
include::analysis/tokenizers.asciidoc[]
include::analysis/tokenfilters.asciidoc[]

View File

@ -0,0 +1,57 @@
[[analysis-normalizers]]
== Normalizers
experimental[]
Normalizers are similar to analyzers except that they may only emit a single
token. As a consequence, they do not have a tokenizer and only accept a subset
of the available char filters and token filters. Only the filters that work on
a per-character basis are allowed. For instance a lowercasing filter would be
allowed, but not a stemming filter, which needs to look at the keyword as a
whole.
[float]
=== Custom normalizers
Elasticsearch does not ship with built-in normalizers yet, so the only way
to get one is by building a custom one. Custom normalizers take a list of
<<analysis-charfilters,char filters>> and a list of
<<analysis-tokenfilters,token filters>>.
[source,js]
--------------------------------
PUT index
{
"settings": {
"analysis": {
"char_filter": {
"quote": {
"type": "mapping",
"mappings": [
"« => \"",
"» => \""
]
}
},
"normalizer": {
"my_normalizer": {
"type": "custom",
"char_filter": ["quote"],
"filter": ["lowercase", "asciifolding"]
}
}
}
},
"mappings": {
"type": {
"properties": {
"foo": {
"type": "keyword",
"normalizer": "my_normalizer"
}
}
}
}
}
--------------------------------
// CONSOLE

View File

@ -8,6 +8,7 @@ parameters that are used by <<mapping-types,field mappings>>:
The following mapping parameters are common to some or all field datatypes:
* <<analyzer,`analyzer`>>
* <<normalizer, `normalizer`>>
* <<mapping-boost,`boost`>>
* <<coerce,`coerce`>>
* <<copy-to,`copy_to`>>
@ -34,6 +35,8 @@ The following mapping parameters are common to some or all field datatypes:
include::params/analyzer.asciidoc[]
include::params/normalizer.asciidoc[]
include::params/boost.asciidoc[]
include::params/coerce.asciidoc[]

View File

@ -0,0 +1,163 @@
[[normalizer]]
=== `normalizer`
The `normalizer` property of <<keyword,`keyword`>> fields is similar to
<<analyzer,`analyzer`>> except that it guarantees that the analysis chain
produces a single token.
The `normalizer` is applied prior to indexing the keyword, as well as at
search-time when the `keyword` field is searched via a query parser such as
the <<query-dsl-match-query,`match`>> query.
[source,js]
--------------------------------
PUT index
{
"settings": {
"analysis": {
"normalizer": {
"my_normalizer": {
"type": "custom",
"char_filter": [],
"filter": ["lowercase", "asciifolding"]
}
}
}
},
"mappings": {
"type": {
"properties": {
"foo": {
"type": "keyword",
"normalizer": "my_normalizer"
}
}
}
}
}
PUT index/type/1
{
"foo": "BÀR"
}
PUT index/type/2
{
"foo": "bar"
}
PUT index/type/3
{
"foo": "baz"
}
POST index/_refresh
GET index/_search
{
"query": {
"match": {
"foo": "BAR"
}
}
}
--------------------------------
// CONSOLE
The above query matches documents 1 and 2 since `BÀR` is converted to `bar` at
both index and query time.
[source,js]
----------------------------
{
"took": $body.took,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0.2876821,
"hits": [
{
"_index": "index",
"_type": "type",
"_id": "2",
"_score": 0.2876821,
"_source": {
"foo": "bar"
}
},
{
"_index": "index",
"_type": "type",
"_id": "1",
"_score": 0.2876821,
"_source": {
"foo": "BÀR"
}
}
]
}
}
----------------------------
// TESTRESPONSE[s/"took".*/"took": "$body.took",/]
The fact that keywords are converted prior to indexing also means that
aggregations return normalized values:
[source,js]
--------------------------------
GET index/_search
{
"size": 0,
"aggs": {
"foo_terms": {
"terms": {
"field": "foo"
}
}
}
}
--------------------------------
// CONSOLE
// TEST[continued]
returns
[source,js]
----------------------------
{
"took": 43,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 0.0,
"hits": []
},
"aggregations": {
"foo_terms": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "bar",
"doc_count": 2
},
{
"key": "baz",
"doc_count": 1
}
]
}
}
}
----------------------------
// TESTRESPONSE[s/"took".*/"took": "$body.took",/]

View File

@ -109,6 +109,12 @@ The following parameters are accepted by `keyword` fields:
Which scoring algorithm or _similarity_ should be used. Defaults
to `classic`, which uses TF/IDF.
<<normalizer,`normalizer`>>::
experimental[]
How to pre-process the keyword prior to indexing. Defaults to `null`,
meaning the keyword is kept as-is.
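
For reference, a minimal Java sketch, in the style of KeywordFieldMapperTests earlier
in this commit, of building a mapping that sets the parameter (the class name is
hypothetical; `my_normalizer` must be declared under `index.analysis.normalizer` in
the index settings):

import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;

public class NormalizerMappingSketch {
    public static void main(String[] args) throws Exception {
        XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type")
                .startObject("properties").startObject("field")
                    .field("type", "keyword")
                    .field("normalizer", "my_normalizer")
                .endObject().endObject()
                .endObject().endObject();
        System.out.println(mapping.string());
    }
}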
NOTE: Indexes imported from 2.x do not support `keyword`. Instead they will
attempt to downgrade `keyword` into `string`. This allows you to merge modern
mappings with legacy mappings. Long lived indexes will have to be recreated