From 3f805d68cb9047e8cd77654561dd0f5e5bba5379 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Fri, 30 Dec 2016 09:36:10 +0100 Subject: [PATCH] Add the ability to set an analyzer on keyword fields. (#21919) This adds a new `normalizer` property to `keyword` fields that pre-processes the field value prior to indexing, but without altering the `_source`. Note that only the normalization components that work on a per-character basis are applied, so for instance stemming filters will be ignored while lowercasing or ascii folding will be applied. Closes #18064 --- .../metadata/MetaDataIndexUpgradeService.java | 5 +- .../index/analysis/AnalysisRegistry.java | 131 +++++++++++--- .../index/analysis/CustomAnalyzer.java | 23 +++ .../analysis/CustomNormalizerProvider.java | 95 ++++++++++ .../index/analysis/IndexAnalyzers.java | 14 +- .../index/mapper/KeywordFieldMapper.java | 114 +++++++++++- .../indices/analysis/AnalysisModule.java | 10 +- .../gateway/GatewayIndexStateIT.java | 2 +- .../elasticsearch/index/IndexModuleTests.java | 26 +-- .../index/analysis/AnalysisRegistryTests.java | 26 +-- .../index/analysis/CustomNormalizerTests.java | 102 +++++++++++ .../index/engine/InternalEngineTests.java | 2 +- .../index/mapper/KeywordFieldMapperTests.java | 64 ++++++- .../index/mapper/KeywordFieldTypeTests.java | 48 +++++- .../index/mapper/ParentFieldMapperTests.java | 2 +- docs/reference/analysis.asciidoc | 2 + docs/reference/analysis/normalizers.asciidoc | 57 ++++++ docs/reference/mapping/params.asciidoc | 3 + .../mapping/params/normalizer.asciidoc | 163 ++++++++++++++++++ docs/reference/mapping/types/keyword.asciidoc | 6 + 20 files changed, 827 insertions(+), 68 deletions(-) create mode 100644 core/src/main/java/org/elasticsearch/index/analysis/CustomNormalizerProvider.java create mode 100644 core/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java create mode 100644 docs/reference/analysis/normalizers.asciidoc create mode 100644 
docs/reference/mapping/params/normalizer.asciidoc diff --git a/core/src/main/java/org/elasticsearch/cluster/metadata/MetaDataIndexUpgradeService.java b/core/src/main/java/org/elasticsearch/cluster/metadata/MetaDataIndexUpgradeService.java index 1779699d448..614d12547fc 100644 --- a/core/src/main/java/org/elasticsearch/cluster/metadata/MetaDataIndexUpgradeService.java +++ b/core/src/main/java/org/elasticsearch/cluster/metadata/MetaDataIndexUpgradeService.java @@ -145,11 +145,10 @@ public class MetaDataIndexUpgradeService extends AbstractComponent { @Override public Set> entrySet() { - // just to ensure we can iterate over this single analzyer - return Collections.singletonMap(fakeDefault.name(), fakeDefault).entrySet(); + return Collections.emptySet(); } }; - try (IndexAnalyzers fakeIndexAnalzyers = new IndexAnalyzers(indexSettings, fakeDefault, fakeDefault, fakeDefault, analyzerMap)) { + try (IndexAnalyzers fakeIndexAnalzyers = new IndexAnalyzers(indexSettings, fakeDefault, fakeDefault, fakeDefault, analyzerMap, analyzerMap)) { MapperService mapperService = new MapperService(indexSettings, fakeIndexAnalzyers, xContentRegistry, similarityService, mapperRegistry, () -> null); mapperService.merge(indexMetaData, MapperService.MergeReason.MAPPING_RECOVERY, false); diff --git a/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java b/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java index 87f9692f625..25ef5d1885f 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java @@ -67,17 +67,20 @@ public final class AnalysisRegistry implements Closeable { private final Map> tokenFilters; private final Map> tokenizers; private final Map>> analyzers; + private final Map>> normalizers; public AnalysisRegistry(Environment environment, Map> charFilters, Map> tokenFilters, Map> tokenizers, - Map>> analyzers) { + Map>> analyzers, + 
Map>> normalizers) { this.environment = environment; this.charFilters = unmodifiableMap(charFilters); this.tokenFilters = unmodifiableMap(tokenFilters); this.tokenizers = unmodifiableMap(tokenizers); this.analyzers = unmodifiableMap(analyzers); + this.normalizers = unmodifiableMap(normalizers); } /** @@ -151,7 +154,8 @@ public final class AnalysisRegistry implements Closeable { final Map tokenizerFactories = buildTokenizerFactories(indexSettings); final Map tokenFilterFactories = buildTokenFilterFactories(indexSettings); final Map> analyzierFactories = buildAnalyzerFactories(indexSettings); - return build(indexSettings, analyzierFactories, tokenizerFactories, charFilterFactories, tokenFilterFactories); + final Map> normalizerFactories = buildNormalizerFactories(indexSettings); + return build(indexSettings, analyzierFactories, normalizerFactories, tokenizerFactories, charFilterFactories, tokenFilterFactories); } public Map buildTokenFilterFactories(IndexSettings indexSettings) throws IOException { @@ -164,22 +168,28 @@ public final class AnalysisRegistry implements Closeable { */ tokenFilters.put("synonym", requriesAnalysisSettings((is, env, name, settings) -> new SynonymTokenFilterFactory(is, env, this, name, settings))); tokenFilters.put("synonym_graph", requriesAnalysisSettings((is, env, name, settings) -> new SynonymGraphFilterFactory(is, env, this, name, settings))); - return buildMapping(false, "tokenfilter", indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.tokenFilterFactories); + return buildMapping(Component.FILTER, indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.tokenFilterFactories); } public Map buildTokenizerFactories(IndexSettings indexSettings) throws IOException { final Map tokenizersSettings = indexSettings.getSettings().getGroups(INDEX_ANALYSIS_TOKENIZER); - return buildMapping(false, "tokenizer", indexSettings, tokenizersSettings, tokenizers, 
prebuiltAnalysis.tokenizerFactories); + return buildMapping(Component.TOKENIZER, indexSettings, tokenizersSettings, tokenizers, prebuiltAnalysis.tokenizerFactories); } public Map buildCharFilterFactories(IndexSettings indexSettings) throws IOException { final Map charFiltersSettings = indexSettings.getSettings().getGroups(INDEX_ANALYSIS_CHAR_FILTER); - return buildMapping(false, "charfilter", indexSettings, charFiltersSettings, charFilters, prebuiltAnalysis.charFilterFactories); + return buildMapping(Component.CHAR_FILTER, indexSettings, charFiltersSettings, charFilters, prebuiltAnalysis.charFilterFactories); } public Map> buildAnalyzerFactories(IndexSettings indexSettings) throws IOException { final Map analyzersSettings = indexSettings.getSettings().getGroups("index.analysis.analyzer"); - return buildMapping(true, "analyzer", indexSettings, analyzersSettings, analyzers, prebuiltAnalysis.analyzerProviderFactories); + return buildMapping(Component.ANALYZER, indexSettings, analyzersSettings, analyzers, prebuiltAnalysis.analyzerProviderFactories); + } + + public Map> buildNormalizerFactories(IndexSettings indexSettings) throws IOException { + final Map noralizersSettings = indexSettings.getSettings().getGroups("index.analysis.normalizer"); + // TODO: Have pre-built normalizers + return buildMapping(Component.NORMALIZER, indexSettings, noralizersSettings, normalizers, Collections.emptyMap()); } /** @@ -194,7 +204,7 @@ public final class AnalysisRegistry implements Closeable { final Map tokenizerSettings = indexSettings.getSettings().getGroups("index.analysis.tokenizer"); if (tokenizerSettings.containsKey(tokenizer)) { Settings currentSettings = tokenizerSettings.get(tokenizer); - return getAnalysisProvider("tokenizer", tokenizers, tokenizer, currentSettings.get("type")); + return getAnalysisProvider(Component.TOKENIZER, tokenizers, tokenizer, currentSettings.get("type")); } else { return getTokenizerProvider(tokenizer); } @@ -223,7 +233,7 @@ public final class 
AnalysisRegistry implements Closeable { } else if ("synonym_graph".equals(typeName)) { return requriesAnalysisSettings((is, env, name, settings) -> new SynonymGraphFilterFactory(is, env, this, name, settings)); } else { - return getAnalysisProvider("tokenfilter", tokenFilters, tokenFilter, typeName); + return getAnalysisProvider(Component.FILTER, tokenFilters, tokenFilter, typeName); } } else { return getTokenFilterProvider(tokenFilter); @@ -242,7 +252,7 @@ public final class AnalysisRegistry implements Closeable { final Map tokenFilterSettings = indexSettings.getSettings().getGroups("index.analysis.char_filter"); if (tokenFilterSettings.containsKey(charFilter)) { Settings currentSettings = tokenFilterSettings.get(charFilter); - return getAnalysisProvider("charfilter", charFilters, charFilter, currentSettings.get("type")); + return getAnalysisProvider(Component.CHAR_FILTER, charFilters, charFilter, currentSettings.get("type")); } else { return getCharFilterProvider(charFilter); } @@ -261,7 +271,40 @@ public final class AnalysisRegistry implements Closeable { }; } - private Map buildMapping(boolean analyzer, String toBuild, IndexSettings settings, Map settingsMap, + enum Component { + ANALYZER { + @Override + public String toString() { + return "analyzer"; + } + }, + NORMALIZER { + @Override + public String toString() { + return "normalizer"; + } + }, + CHAR_FILTER { + @Override + public String toString() { + return "char_filter"; + } + }, + TOKENIZER { + @Override + public String toString() { + return "tokenizer"; + } + }, + FILTER { + @Override + public String toString() { + return "filter"; + } + }; + } + + private Map buildMapping(Component component, IndexSettings settings, Map settingsMap, Map> providerMap, Map> defaultInstance) throws IOException { Settings defaultSettings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, settings.getIndexVersionCreated()).build(); @@ -270,29 +313,34 @@ public final class AnalysisRegistry implements Closeable { 
String name = entry.getKey(); Settings currentSettings = entry.getValue(); String typeName = currentSettings.get("type"); - if (analyzer) { - T factory; + if (component == Component.ANALYZER) { + T factory = null; if (typeName == null) { if (currentSettings.get("tokenizer") != null) { factory = (T) new CustomAnalyzerProvider(settings, name, currentSettings); } else { - throw new IllegalArgumentException(toBuild + " [" + name + "] must specify either an analyzer type, or a tokenizer"); + throw new IllegalArgumentException(component + " [" + name + "] must specify either an analyzer type, or a tokenizer"); } } else if (typeName.equals("custom")) { factory = (T) new CustomAnalyzerProvider(settings, name, currentSettings); - } else { - AnalysisModule.AnalysisProvider type = providerMap.get(typeName); - if (type == null) { - throw new IllegalArgumentException("Unknown " + toBuild + " type [" + typeName + "] for [" + name + "]"); - } - factory = type.get(settings, environment, name, currentSettings); } - factories.put(name, factory); - } else { - AnalysisProvider type = getAnalysisProvider(toBuild, providerMap, name, typeName); - final T factory = type.get(settings, environment, name, currentSettings); - factories.put(name, factory); + if (factory != null) { + factories.put(name, factory); + continue; + } + } else if (component == Component.NORMALIZER) { + if (typeName == null || typeName.equals("custom")) { + T factory = (T) new CustomNormalizerProvider(settings, name, currentSettings); + factories.put(name, factory); + continue; + } } + AnalysisProvider type = getAnalysisProvider(component, providerMap, name, typeName); + if (type == null) { + throw new IllegalArgumentException("Unknown " + component + " type [" + typeName + "] for [" + name + "]"); + } + final T factory = type.get(settings, environment, name, currentSettings); + factories.put(name, factory); } // go over the char filters in the bindings and register the ones that are not configured @@ -330,13 +378,13 
@@ public final class AnalysisRegistry implements Closeable { return factories; } - private AnalysisProvider getAnalysisProvider(String toBuild, Map> providerMap, String name, String typeName) { + private AnalysisProvider getAnalysisProvider(Component component, Map> providerMap, String name, String typeName) { if (typeName == null) { - throw new IllegalArgumentException(toBuild + " [" + name + "] must specify either an analyzer type, or a tokenizer"); + throw new IllegalArgumentException(component + " [" + name + "] must specify either an analyzer type, or a tokenizer"); } AnalysisProvider type = providerMap.get(typeName); if (type == null) { - throw new IllegalArgumentException("Unknown " + toBuild + " type [" + typeName + "] for [" + name + "]"); + throw new IllegalArgumentException("Unknown " + component + " type [" + typeName + "] for [" + name + "]"); } return type; } @@ -426,6 +474,7 @@ public final class AnalysisRegistry implements Closeable { public IndexAnalyzers build(IndexSettings indexSettings, Map> analyzerProviders, + Map> normalizerProviders, Map tokenizerFactoryFactories, Map charFilterFactoryFactories, Map tokenFilterFactoryFactories) { @@ -436,10 +485,15 @@ public final class AnalysisRegistry implements Closeable { DeprecationLogger deprecationLogger = new DeprecationLogger(logger); Map analyzerAliases = new HashMap<>(); Map analyzers = new HashMap<>(); + Map normalizers = new HashMap<>(); for (Map.Entry> entry : analyzerProviders.entrySet()) { processAnalyzerFactory(deprecationLogger, indexSettings, entry.getKey(), entry.getValue(), analyzerAliases, analyzers, tokenFilterFactoryFactories, charFilterFactoryFactories, tokenizerFactoryFactories); } + for (Map.Entry> entry : normalizerProviders.entrySet()) { + processNormalizerFactory(deprecationLogger, indexSettings, entry.getKey(), entry.getValue(), normalizers, + tokenFilterFactoryFactories, charFilterFactoryFactories); + } for (Map.Entry entry : analyzerAliases.entrySet()) { String key = 
entry.getKey(); if (analyzers.containsKey(key) && @@ -485,7 +539,7 @@ public final class AnalysisRegistry implements Closeable { } } return new IndexAnalyzers(indexSettings, defaultIndexAnalyzer, defaultSearchAnalyzer, defaultSearchQuoteAnalyzer, - unmodifiableMap(analyzers)); + unmodifiableMap(analyzers), unmodifiableMap(normalizers)); } private void processAnalyzerFactory(DeprecationLogger deprecationLogger, @@ -551,4 +605,25 @@ public final class AnalysisRegistry implements Closeable { } } } + + private void processNormalizerFactory(DeprecationLogger deprecationLogger, + IndexSettings indexSettings, + String name, + AnalyzerProvider normalizerFactory, + Map normalizers, + Map tokenFilters, + Map charFilters) { + if (normalizerFactory instanceof CustomNormalizerProvider) { + ((CustomNormalizerProvider) normalizerFactory).build(charFilters, tokenFilters); + } + Analyzer normalizerF = normalizerFactory.get(); + if (normalizerF == null) { + throw new IllegalArgumentException("normalizer [" + normalizerFactory.name() + "] created null normalizer"); + } + NamedAnalyzer normalizer = new NamedAnalyzer(name, normalizerFactory.scope(), normalizerF); + if (normalizers.containsKey(name)) { + throw new IllegalStateException("already registered analyzer with name: " + name); + } + normalizers.put(name, normalizer); + } } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/CustomAnalyzer.java b/core/src/main/java/org/elasticsearch/index/analysis/CustomAnalyzer.java index 6185f358568..68799413907 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/CustomAnalyzer.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/CustomAnalyzer.java @@ -94,4 +94,27 @@ public final class CustomAnalyzer extends Analyzer { } return reader; } + + @Override + protected Reader initReaderForNormalization(String fieldName, Reader reader) { + for (CharFilterFactory charFilter : charFilters) { + if (charFilter instanceof MultiTermAwareComponent) { + charFilter = 
(CharFilterFactory) ((MultiTermAwareComponent) charFilter).getMultiTermComponent(); + reader = charFilter.create(reader); + } + } + return reader; + } + + @Override + protected TokenStream normalize(String fieldName, TokenStream in) { + TokenStream result = in; + for (TokenFilterFactory filter : tokenFilters) { + if (filter instanceof MultiTermAwareComponent) { + filter = (TokenFilterFactory) ((MultiTermAwareComponent) filter).getMultiTermComponent(); + result = filter.create(result); + } + } + return result; + } } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/CustomNormalizerProvider.java b/core/src/main/java/org/elasticsearch/index/analysis/CustomNormalizerProvider.java new file mode 100644 index 00000000000..4f50a34dd9e --- /dev/null +++ b/core/src/main/java/org/elasticsearch/index/analysis/CustomNormalizerProvider.java @@ -0,0 +1,95 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.analysis; + +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.indices.analysis.PreBuiltTokenizers; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +/** + * A custom normalizer that is built out of char and token filters. Unlike + * analyzers, it does not support tokenizers and only supports a + * subset of char and token filters. + */ +public final class CustomNormalizerProvider extends AbstractIndexAnalyzerProvider { + + private final Settings analyzerSettings; + + private CustomAnalyzer customAnalyzer; + + public CustomNormalizerProvider(IndexSettings indexSettings, + String name, Settings settings) { + super(indexSettings, name, settings); + this.analyzerSettings = settings; + } + + public void build(final Map charFilters, final Map tokenFilters) { + String tokenizerName = analyzerSettings.get("tokenizer"); + if (tokenizerName != null) { + throw new IllegalArgumentException("Custom normalizer [" + name() + "] cannot configure a tokenizer"); + } + + List charFiltersList = new ArrayList<>(); + String[] charFilterNames = analyzerSettings.getAsArray("char_filter"); + for (String charFilterName : charFilterNames) { + CharFilterFactory charFilter = charFilters.get(charFilterName); + if (charFilter == null) { + throw new IllegalArgumentException("Custom normalizer [" + name() + "] failed to find char_filter under name [" + + charFilterName + "]"); + } + if (charFilter instanceof MultiTermAwareComponent == false) { + throw new IllegalArgumentException("Custom normalizer [" + name() + "] may not use char filter [" + + charFilterName + "]"); + } + charFilter = (CharFilterFactory) ((MultiTermAwareComponent) charFilter).getMultiTermComponent(); + charFiltersList.add(charFilter); + } + + List tokenFilterList = new ArrayList<>(); + String[] tokenFilterNames = analyzerSettings.getAsArray("filter"); + for (String 
tokenFilterName : tokenFilterNames) { + TokenFilterFactory tokenFilter = tokenFilters.get(tokenFilterName); + if (tokenFilter == null) { + throw new IllegalArgumentException("Custom Analyzer [" + name() + "] failed to find filter under name [" + + tokenFilterName + "]"); + } + if (tokenFilter instanceof MultiTermAwareComponent == false) { + throw new IllegalArgumentException("Custom normalizer [" + name() + "] may not use filter [" + tokenFilterName + "]"); + } + tokenFilter = (TokenFilterFactory) ((MultiTermAwareComponent) tokenFilter).getMultiTermComponent(); + tokenFilterList.add(tokenFilter); + } + + this.customAnalyzer = new CustomAnalyzer( + PreBuiltTokenizers.KEYWORD.getTokenizerFactory(indexSettings.getIndexVersionCreated()), + charFiltersList.toArray(new CharFilterFactory[charFiltersList.size()]), + tokenFilterList.toArray(new TokenFilterFactory[tokenFilterList.size()]) + ); + } + + @Override + public CustomAnalyzer get() { + return this.customAnalyzer; + } +} diff --git a/core/src/main/java/org/elasticsearch/index/analysis/IndexAnalyzers.java b/core/src/main/java/org/elasticsearch/index/analysis/IndexAnalyzers.java index 127714178b5..f3200d606fb 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/IndexAnalyzers.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/IndexAnalyzers.java @@ -25,6 +25,7 @@ import org.elasticsearch.index.IndexSettings; import java.io.Closeable; import java.io.IOException; import java.util.Map; +import java.util.stream.Stream; /** * IndexAnalyzers contains a name to analyzer mapping for a specific index. 
@@ -38,15 +39,18 @@ public final class IndexAnalyzers extends AbstractIndexComponent implements Clos private final NamedAnalyzer defaultSearchAnalyzer; private final NamedAnalyzer defaultSearchQuoteAnalyzer; private final Map analyzers; + private final Map normalizers; private final IndexSettings indexSettings; public IndexAnalyzers(IndexSettings indexSettings, NamedAnalyzer defaultIndexAnalyzer, NamedAnalyzer defaultSearchAnalyzer, - NamedAnalyzer defaultSearchQuoteAnalyzer, Map analyzers) { + NamedAnalyzer defaultSearchQuoteAnalyzer, Map analyzers, + Map normalizers) { super(indexSettings); this.defaultIndexAnalyzer = defaultIndexAnalyzer; this.defaultSearchAnalyzer = defaultSearchAnalyzer; this.defaultSearchQuoteAnalyzer = defaultSearchQuoteAnalyzer; this.analyzers = analyzers; + this.normalizers = normalizers; this.indexSettings = indexSettings; } @@ -57,6 +61,12 @@ public final class IndexAnalyzers extends AbstractIndexComponent implements Clos return analyzers.get(name); } + /** + * Returns a normalizer mapped to the given name or null if not present + */ + public NamedAnalyzer getNormalizer(String name) { + return normalizers.get(name); + } /** * Returns the default index analyzer for this index @@ -81,7 +91,7 @@ public final class IndexAnalyzers extends AbstractIndexComponent implements Clos @Override public void close() throws IOException { - IOUtils.close(() -> analyzers.values().stream() + IOUtils.close(() -> Stream.concat(analyzers.values().stream(), normalizers.values().stream()) .filter(a -> a.scope() == AnalyzerScope.INDEX) .iterator()); } diff --git a/core/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java b/core/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java index 68807215027..f4f6266262f 100644 --- a/core/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java +++ b/core/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java @@ -19,16 +19,20 @@ package 
org.elasticsearch.index.mapper; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.document.Field; import org.apache.lucene.document.SortedSetDocValuesField; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexableField; import org.apache.lucene.search.Query; import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.lucene.Lucene; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentParser; import org.elasticsearch.common.xcontent.support.XContentMapValues; +import org.elasticsearch.index.analysis.NamedAnalyzer; import org.elasticsearch.index.fielddata.IndexFieldData; import org.elasticsearch.index.fielddata.plain.DocValuesIndexFieldData; @@ -36,6 +40,7 @@ import java.io.IOException; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Objects; import static org.elasticsearch.index.mapper.TypeParsers.parseField; @@ -70,6 +75,11 @@ public final class KeywordFieldMapper extends FieldMapper { builder = this; } + @Override + public KeywordFieldType fieldType() { + return (KeywordFieldType) super.fieldType(); + } + public Builder ignoreAbove(int ignoreAbove) { if (ignoreAbove < 0) { throw new IllegalArgumentException("[ignore_above] must be positive, got " + ignoreAbove); @@ -92,6 +102,12 @@ public final class KeywordFieldMapper extends FieldMapper { return builder; } + public Builder normalizer(NamedAnalyzer normalizer) { + fieldType().setNormalizer(normalizer); + fieldType().setSearchAnalyzer(normalizer); + return builder; + } + @Override public KeywordFieldMapper build(BuilderContext context) { setupFieldType(context); @@ -103,7 +119,7 @@ public final class KeywordFieldMapper extends FieldMapper { public static class TypeParser implements Mapper.TypeParser { @Override - public Mapper.Builder 
parse(String name, Map node, ParserContext parserContext) throws MapperParsingException { + public Mapper.Builder parse(String name, Map node, ParserContext parserContext) throws MapperParsingException { KeywordFieldMapper.Builder builder = new KeywordFieldMapper.Builder(name); parseField(builder, name, node, parserContext); for (Iterator> iterator = node.entrySet().iterator(); iterator.hasNext();) { @@ -125,6 +141,15 @@ public final class KeywordFieldMapper extends FieldMapper { } else if (propName.equals("eager_global_ordinals")) { builder.eagerGlobalOrdinals(XContentMapValues.nodeBooleanValue(propNode)); iterator.remove(); + } else if (propName.equals("normalizer")) { + if (propNode != null) { + NamedAnalyzer normalizer = parserContext.getIndexAnalyzers().getNormalizer(propNode.toString()); + if (normalizer == null) { + throw new MapperParsingException("normalizer [" + propNode.toString() + "] not found for field [" + name + "]"); + } + builder.normalizer(normalizer); + } + iterator.remove(); } } return builder; @@ -133,21 +158,58 @@ public final class KeywordFieldMapper extends FieldMapper { public static final class KeywordFieldType extends StringFieldType { - public KeywordFieldType() {} + private NamedAnalyzer normalizer = null; + + public KeywordFieldType() { + setIndexAnalyzer(Lucene.KEYWORD_ANALYZER); + setSearchAnalyzer(Lucene.KEYWORD_ANALYZER); + } protected KeywordFieldType(KeywordFieldType ref) { super(ref); + this.normalizer = ref.normalizer; } public KeywordFieldType clone() { return new KeywordFieldType(this); } + @Override + public boolean equals(Object o) { + if (super.equals(o) == false) { + return false; + } + return Objects.equals(normalizer, ((KeywordFieldType) o).normalizer); + } + + @Override + public void checkCompatibility(MappedFieldType otherFT, List conflicts, boolean strict) { + super.checkCompatibility(otherFT, conflicts, strict); + KeywordFieldType other = (KeywordFieldType) otherFT; + if (Objects.equals(normalizer, 
other.normalizer) == false) { + conflicts.add("mapper [" + name() + "] has different [normalizer]"); + } + } + + @Override + public int hashCode() { + return 31 * super.hashCode() + Objects.hashCode(normalizer); + } + @Override public String typeName() { return CONTENT_TYPE; } + public NamedAnalyzer normalizer() { + return normalizer; + } + + public void setNormalizer(NamedAnalyzer normalizer) { + checkIfFrozen(); + this.normalizer = normalizer; + } + @Override public Query nullValueQuery() { if (nullValue() == null) { @@ -171,13 +233,25 @@ public final class KeywordFieldMapper extends FieldMapper { BytesRef binaryValue = (BytesRef) value; return binaryValue.utf8ToString(); } + + @Override + protected BytesRef indexedValueForSearch(Object value) { + if (value == null) { + return null; + } + if (value instanceof BytesRef) { + value = ((BytesRef) value).utf8ToString(); + } + return searchAnalyzer().normalize(name(), value.toString()); + } } private Boolean includeInAll; private int ignoreAbove; protected KeywordFieldMapper(String simpleName, MappedFieldType fieldType, MappedFieldType defaultFieldType, - int ignoreAbove, Boolean includeInAll, Settings indexSettings, MultiFields multiFields, CopyTo copyTo) { + int ignoreAbove, Boolean includeInAll, + Settings indexSettings, MultiFields multiFields, CopyTo copyTo) { super(simpleName, fieldType, defaultFieldType, indexSettings, multiFields, copyTo); assert fieldType.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) <= 0; this.ignoreAbove = ignoreAbove; @@ -196,6 +270,11 @@ public final class KeywordFieldMapper extends FieldMapper { return (KeywordFieldMapper) super.clone(); } + @Override + public KeywordFieldType fieldType() { + return (KeywordFieldType) super.fieldType(); + } + // pkg-private for testing Boolean includeInAll() { return includeInAll; @@ -203,7 +282,7 @@ public final class KeywordFieldMapper extends FieldMapper { @Override protected void parseCreateField(ParseContext context, List fields) throws 
IOException { - final String value; + String value; if (context.externalValueSet()) { value = context.externalValue().toString(); } else { @@ -219,6 +298,27 @@ public final class KeywordFieldMapper extends FieldMapper { return; } + final NamedAnalyzer normalizer = fieldType().normalizer(); + if (normalizer != null) { + try (final TokenStream ts = normalizer.tokenStream(name(), value)) { + final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); + ts.reset(); + if (ts.incrementToken() == false) { + throw new IllegalStateException("The normalization token stream is " + + "expected to produce exactly 1 token, but got 0 for analyzer " + + normalizer + " and input \"" + value + "\""); + } + final String newValue = termAtt.toString(); + if (ts.incrementToken()) { + throw new IllegalStateException("The normalization token stream is " + + "expected to produce exactly 1 token, but got 2+ for analyzer " + + normalizer + " and input \"" + value + "\""); + } + ts.end(); + value = newValue; + } + } + if (context.includeInAll(includeInAll, this)) { context.allEntries().addText(fieldType().name(), value, fieldType().boost()); } @@ -263,5 +363,11 @@ public final class KeywordFieldMapper extends FieldMapper { if (includeDefaults || ignoreAbove != Defaults.IGNORE_ABOVE) { builder.field("ignore_above", ignoreAbove); } + + if (fieldType().normalizer() != null) { + builder.field("normalizer", fieldType().normalizer().name()); + } else if (includeDefaults) { + builder.nullField("normalizer"); + } } } diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java index 89c9421198d..fae4c75b655 100644 --- a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java +++ b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java @@ -170,8 +170,9 @@ public final class AnalysisModule { NamedRegistry> tokenFilters = setupTokenFilters(plugins, 
hunspellService); NamedRegistry> tokenizers = setupTokenizers(plugins); NamedRegistry>> analyzers = setupAnalyzers(plugins); + NamedRegistry>> normalizers = setupNormalizers(plugins); analysisRegistry = new AnalysisRegistry(environment, charFilters.getRegistry(), tokenFilters.getRegistry(), tokenizers - .getRegistry(), analyzers.getRegistry()); + .getRegistry(), analyzers.getRegistry(), normalizers.getRegistry()); } HunspellService getHunspellService() { @@ -334,6 +335,13 @@ public final class AnalysisModule { return analyzers; } + private NamedRegistry>> setupNormalizers(List plugins) { + NamedRegistry>> normalizers = new NamedRegistry<>("normalizer"); + // TODO: provide built-in normalizer providers? + // TODO: pluggability? + return normalizers; + } + private static AnalysisModule.AnalysisProvider requriesAnalysisSettings(AnalysisModule.AnalysisProvider provider) { return new AnalysisModule.AnalysisProvider() { @Override diff --git a/core/src/test/java/org/elasticsearch/gateway/GatewayIndexStateIT.java b/core/src/test/java/org/elasticsearch/gateway/GatewayIndexStateIT.java index c8607e0af31..25153576b6b 100644 --- a/core/src/test/java/org/elasticsearch/gateway/GatewayIndexStateIT.java +++ b/core/src/test/java/org/elasticsearch/gateway/GatewayIndexStateIT.java @@ -435,7 +435,7 @@ public class GatewayIndexStateIT extends ESIntegTestCase { assertEquals(ex.getMessage(), "Failed to verify index " + metaData.getIndex()); assertNotNull(ex.getCause()); assertEquals(IllegalArgumentException.class, ex.getCause().getClass()); - assertEquals(ex.getCause().getMessage(), "Unknown tokenfilter type [icu_collation] for [myCollator]"); + assertEquals(ex.getCause().getMessage(), "Unknown filter type [icu_collation] for [myCollator]"); } /** diff --git a/core/src/test/java/org/elasticsearch/index/IndexModuleTests.java b/core/src/test/java/org/elasticsearch/index/IndexModuleTests.java index 46281c812f1..e62f2178a4f 100644 --- 
a/core/src/test/java/org/elasticsearch/index/IndexModuleTests.java +++ b/core/src/test/java/org/elasticsearch/index/IndexModuleTests.java @@ -148,7 +148,7 @@ public class IndexModuleTests extends ESTestCase { public void testWrapperIsBound() throws IOException { IndexModule module = new IndexModule(indexSettings, - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap())); + new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); module.setSearcherWrapper((s) -> new Wrapper()); module.engineFactory.set(new MockEngineFactory(AssertingDirectoryReader.class)); @@ -168,7 +168,7 @@ public class IndexModuleTests extends ESTestCase { .build(); IndexSettings indexSettings = IndexSettingsModule.newIndexSettings(index, settings); IndexModule module = new IndexModule(indexSettings, - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap())); + new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); module.addIndexStore("foo_store", FooStore::new); try { module.addIndexStore("foo_store", FooStore::new); @@ -193,7 +193,7 @@ public class IndexModuleTests extends ESTestCase { }; IndexSettings indexSettings = IndexSettingsModule.newIndexSettings(index, settings); IndexModule module = new IndexModule(indexSettings, - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap())); + new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); module.addIndexEventListener(eventListener); IndexService indexService = newIndexService(module); IndexSettings x = indexService.getIndexSettings(); @@ -208,7 +208,7 @@ public class IndexModuleTests extends ESTestCase { public void testListener() throws IOException { Setting booleanSetting = Setting.boolSetting("index.foo.bar", false, Property.Dynamic, Property.IndexScope); IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings(index, settings, 
booleanSetting), - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap())); + new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); Setting booleanSetting2 = Setting.boolSetting("index.foo.bar.baz", false, Property.Dynamic, Property.IndexScope); AtomicBoolean atomicBoolean = new AtomicBoolean(false); module.addSettingsUpdateConsumer(booleanSetting, atomicBoolean::set); @@ -228,7 +228,7 @@ public class IndexModuleTests extends ESTestCase { public void testAddIndexOperationListener() throws IOException { IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings(index, settings), - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap())); + new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); AtomicBoolean executed = new AtomicBoolean(false); IndexingOperationListener listener = new IndexingOperationListener() { @Override @@ -257,7 +257,7 @@ public class IndexModuleTests extends ESTestCase { public void testAddSearchOperationListener() throws IOException { IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings(index, settings), - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap())); + new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); AtomicBoolean executed = new AtomicBoolean(false); SearchOperationListener listener = new SearchOperationListener() { @@ -291,7 +291,7 @@ public class IndexModuleTests extends ESTestCase { .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .build(); IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings), - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap())); + new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); module.addSimilarity("test_similarity", (string, 
settings) -> new SimilarityProvider() { @Override public String name() { @@ -315,7 +315,7 @@ public class IndexModuleTests extends ESTestCase { public void testFrozen() { IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings(index, settings), - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap())); + new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); module.freeze(); String msg = "Can't modify IndexModule once the index service has been created"; assertEquals(msg, expectThrows(IllegalStateException.class, () -> module.addSearchOperationListener(null)).getMessage()); @@ -334,7 +334,7 @@ public class IndexModuleTests extends ESTestCase { .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .build(); IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings), - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap())); + new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); Exception ex = expectThrows(IllegalArgumentException.class, () -> newIndexService(module)); assertEquals("Unknown Similarity type [test_similarity] for [my_similarity]", ex.getMessage()); } @@ -346,7 +346,7 @@ public class IndexModuleTests extends ESTestCase { .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) .build(); IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings), - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap())); + new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); Exception ex = expectThrows(IllegalArgumentException.class, () -> newIndexService(module)); assertEquals("Similarity [my_similarity] must have an associated type", ex.getMessage()); } @@ -356,7 +356,7 @@ public class IndexModuleTests extends ESTestCase { 
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build(); IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings), - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap())); + new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); module.forceQueryCacheProvider((a, b) -> new CustomQueryCache()); expectThrows(AlreadySetException.class, () -> module.forceQueryCacheProvider((a, b) -> new CustomQueryCache())); IndexService indexService = newIndexService(module); @@ -369,7 +369,7 @@ public class IndexModuleTests extends ESTestCase { .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build(); IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings), - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap())); + new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); IndexService indexService = newIndexService(module); assertTrue(indexService.cache().query() instanceof IndexQueryCache); indexService.close("simon says", false); @@ -381,7 +381,7 @@ public class IndexModuleTests extends ESTestCase { .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build(); IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings), - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap())); + new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); module.forceQueryCacheProvider((a, b) -> new CustomQueryCache()); IndexService indexService = newIndexService(module); assertTrue(indexService.cache().query() instanceof DisabledQueryCache); 
diff --git a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java index dedd478e3bf..432ff5247b5 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java +++ b/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java @@ -65,7 +65,7 @@ public class AnalysisRegistryTests extends ESTestCase { .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .build(); registry = new AnalysisRegistry(new Environment(settings), - emptyMap(), emptyMap(), emptyMap(), emptyMap()); + emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()); } public void testDefaultAnalyzers() throws IOException { @@ -76,7 +76,8 @@ public class AnalysisRegistryTests extends ESTestCase { .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .build(); IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); - IndexAnalyzers indexAnalyzers = new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap()) + IndexAnalyzers indexAnalyzers = new AnalysisRegistry(new Environment(settings), + emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()) .build(idxSettings); assertThat(indexAnalyzers.getDefaultIndexAnalyzer().analyzer(), instanceOf(StandardAnalyzer.class)); assertThat(indexAnalyzers.getDefaultSearchAnalyzer().analyzer(), instanceOf(StandardAnalyzer.class)); @@ -88,7 +89,7 @@ public class AnalysisRegistryTests extends ESTestCase { Settings settings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, version).build(); IndexAnalyzers indexAnalyzers = registry.build(IndexSettingsModule.newIndexSettings("index", settings), singletonMap("default", analyzerProvider("default")) - , emptyMap(), emptyMap(), emptyMap()); + , emptyMap(), emptyMap(), emptyMap(), emptyMap()); assertThat(indexAnalyzers.getDefaultIndexAnalyzer().analyzer(), 
instanceOf(EnglishAnalyzer.class)); assertThat(indexAnalyzers.getDefaultSearchAnalyzer().analyzer(), instanceOf(EnglishAnalyzer.class)); assertThat(indexAnalyzers.getDefaultSearchQuoteAnalyzer().analyzer(), instanceOf(EnglishAnalyzer.class)); @@ -100,7 +101,7 @@ public class AnalysisRegistryTests extends ESTestCase { AnalyzerProvider defaultIndex = new PreBuiltAnalyzerProvider("default_index", AnalyzerScope.INDEX, new EnglishAnalyzer()); IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> registry.build(IndexSettingsModule.newIndexSettings("index", settings), - singletonMap("default_index", defaultIndex), emptyMap(), emptyMap(), emptyMap())); + singletonMap("default_index", defaultIndex), emptyMap(), emptyMap(), emptyMap(), emptyMap())); assertTrue(e.getMessage().contains("[index.analysis.analyzer.default_index] is not supported")); } @@ -109,7 +110,7 @@ public class AnalysisRegistryTests extends ESTestCase { VersionUtils.getPreviousVersion(Version.V_5_0_0_alpha1)); Settings settings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, version).build(); IndexAnalyzers indexAnalyzers = registry.build(IndexSettingsModule.newIndexSettings("index", settings), - singletonMap("default_index", analyzerProvider("default_index")), emptyMap(), emptyMap(), emptyMap()); + singletonMap("default_index", analyzerProvider("default_index")), emptyMap(), emptyMap(), emptyMap(), emptyMap()); assertThat(indexAnalyzers.getDefaultIndexAnalyzer().analyzer(), instanceOf(EnglishAnalyzer.class)); assertThat(indexAnalyzers.getDefaultSearchAnalyzer().analyzer(), instanceOf(StandardAnalyzer.class)); assertThat(indexAnalyzers.getDefaultSearchQuoteAnalyzer().analyzer(), instanceOf(StandardAnalyzer.class)); @@ -121,7 +122,7 @@ public class AnalysisRegistryTests extends ESTestCase { Version version = VersionUtils.randomVersion(random()); Settings settings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, version).build(); IndexAnalyzers 
indexAnalyzers = registry.build(IndexSettingsModule.newIndexSettings("index", settings), - singletonMap("default_search", analyzerProvider("default_search")), emptyMap(), emptyMap(), emptyMap()); + singletonMap("default_search", analyzerProvider("default_search")), emptyMap(), emptyMap(), emptyMap(), emptyMap()); assertThat(indexAnalyzers.getDefaultIndexAnalyzer().analyzer(), instanceOf(StandardAnalyzer.class)); assertThat(indexAnalyzers.getDefaultSearchAnalyzer().analyzer(), instanceOf(EnglishAnalyzer.class)); assertThat(indexAnalyzers.getDefaultSearchQuoteAnalyzer().analyzer(), instanceOf(EnglishAnalyzer.class)); @@ -135,7 +136,7 @@ public class AnalysisRegistryTests extends ESTestCase { analyzers.put("default_index", analyzerProvider("default_index")); analyzers.put("default_search", analyzerProvider("default_search")); IndexAnalyzers indexAnalyzers = registry.build(IndexSettingsModule.newIndexSettings("index", settings), - analyzers, Collections.emptyMap(), Collections.emptyMap(), Collections.emptyMap()); + analyzers, Collections.emptyMap(), Collections.emptyMap(), Collections.emptyMap(), Collections.emptyMap()); assertThat(indexAnalyzers.getDefaultIndexAnalyzer().analyzer(), instanceOf(EnglishAnalyzer.class)); assertThat(indexAnalyzers.getDefaultSearchAnalyzer().analyzer(), instanceOf(EnglishAnalyzer.class)); assertThat(indexAnalyzers.getDefaultSearchQuoteAnalyzer().analyzer(), instanceOf(EnglishAnalyzer.class)); @@ -196,10 +197,11 @@ public class AnalysisRegistryTests extends ESTestCase { Settings indexSettings = Settings.builder() .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build(); IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings); - IndexAnalyzers indexAnalyzers = new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap()) + IndexAnalyzers indexAnalyzers = new AnalysisRegistry(new Environment(settings), + emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()) 
.build(idxSettings); IndexAnalyzers otherIndexAnalyzers = new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), - emptyMap()).build(idxSettings); + emptyMap(), emptyMap()).build(idxSettings); final int numIters = randomIntBetween(5, 20); for (int i = 0; i < numIters; i++) { PreBuiltAnalyzers preBuiltAnalyzers = RandomPicks.randomFrom(random(), PreBuiltAnalyzers.values()); @@ -219,7 +221,8 @@ public class AnalysisRegistryTests extends ESTestCase { IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); IllegalArgumentException e = expectThrows(IllegalArgumentException.class, - () -> new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap()).build(idxSettings)); + () -> new AnalysisRegistry(new Environment(settings), + emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()).build(idxSettings)); assertThat(e.getMessage(), equalTo("analyzer [test_analyzer] must specify either an analyzer type, or a tokenizer")); } @@ -228,7 +231,8 @@ public class AnalysisRegistryTests extends ESTestCase { Settings indexSettings = Settings.builder() .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build(); IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings); - IndexAnalyzers indexAnalyzers = new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap()) + IndexAnalyzers indexAnalyzers = new AnalysisRegistry(new Environment(settings), + emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()) .build(idxSettings); indexAnalyzers.close(); indexAnalyzers.close(); diff --git a/core/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java b/core/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java new file mode 100644 index 00000000000..3e71a609737 --- /dev/null +++ b/core/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java @@ -0,0 +1,102 @@ +/* + * 
Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.test.ESTokenStreamTestCase; + +import java.io.IOException; + +public class CustomNormalizerTests extends ESTokenStreamTestCase { + + public void testBasics() throws IOException { + Settings settings = Settings.builder() + .putArray("index.analysis.normalizer.my_normalizer.filter", "lowercase", "asciifolding") + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .build(); + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings); + assertNull(analysis.indexAnalyzers.get("my_normalizer")); + NamedAnalyzer normalizer = analysis.indexAnalyzers.getNormalizer("my_normalizer"); + assertNotNull(normalizer); + assertEquals("my_normalizer", normalizer.name()); + assertTokenStreamContents(normalizer.tokenStream("foo", "Cet été-là"), new String[] {"cet ete-la"}); + assertEquals(new BytesRef("cet ete-la"), normalizer.normalize("foo", "Cet été-là")); + } + + public void testUnknownType() { 
+ Settings settings = Settings.builder() + .put("index.analysis.normalizer.my_normalizer.type", "foobar") + .putArray("index.analysis.normalizer.my_normalizer.filter", "lowercase", "asciifolding") + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .build(); + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, + () -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings)); + assertEquals("Unknown normalizer type [foobar] for [my_normalizer]", e.getMessage()); + } + + public void testTokenizer() throws IOException { + Settings settings = Settings.builder() + .put("index.analysis.normalizer.my_normalizer.tokenizer", "keyword") + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .build(); + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, + () -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings)); + assertEquals("Custom normalizer [my_normalizer] cannot configure a tokenizer", e.getMessage()); + } + + public void testCharFilters() throws IOException { + Settings settings = Settings.builder() + .put("index.analysis.char_filter.my_mapping.type", "mapping") + .putArray("index.analysis.char_filter.my_mapping.mappings", "a => z") + .putArray("index.analysis.normalizer.my_normalizer.char_filter", "my_mapping") + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .build(); + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings); + assertNull(analysis.indexAnalyzers.get("my_normalizer")); + NamedAnalyzer normalizer = analysis.indexAnalyzers.getNormalizer("my_normalizer"); + assertNotNull(normalizer); + assertEquals("my_normalizer", normalizer.name()); + assertTokenStreamContents(normalizer.tokenStream("foo", "abc"), new String[] {"zbc"}); + assertEquals(new BytesRef("zbc"), normalizer.normalize("foo", "abc")); + } + + public void testIllegalFilters() throws IOException { + Settings settings = 
Settings.builder() + .putArray("index.analysis.normalizer.my_normalizer.filter", "porter_stem") + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .build(); + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, + () -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings)); + assertEquals("Custom normalizer [my_normalizer] may not use filter [porter_stem]", e.getMessage()); + } + + public void testIllegalCharFilters() throws IOException { + Settings settings = Settings.builder() + .putArray("index.analysis.normalizer.my_normalizer.char_filter", "html_strip") + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .build(); + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, + () -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings)); + assertEquals("Custom normalizer [my_normalizer] may not use char filter [html_strip]", e.getMessage()); + } +} diff --git a/core/src/test/java/org/elasticsearch/index/engine/InternalEngineTests.java b/core/src/test/java/org/elasticsearch/index/engine/InternalEngineTests.java index 8816baceb00..f0ca8292f4f 100644 --- a/core/src/test/java/org/elasticsearch/index/engine/InternalEngineTests.java +++ b/core/src/test/java/org/elasticsearch/index/engine/InternalEngineTests.java @@ -2313,7 +2313,7 @@ public class InternalEngineTests extends ESTestCase { Index index = new Index(indexName, "_na_"); IndexSettings indexSettings = IndexSettingsModule.newIndexSettings(index, settings); NamedAnalyzer defaultAnalyzer = new NamedAnalyzer("default", AnalyzerScope.INDEX, new StandardAnalyzer()); - IndexAnalyzers indexAnalyzers = new IndexAnalyzers(indexSettings, defaultAnalyzer, defaultAnalyzer, defaultAnalyzer, Collections.emptyMap()); + IndexAnalyzers indexAnalyzers = new IndexAnalyzers(indexSettings, defaultAnalyzer, defaultAnalyzer, defaultAnalyzer, Collections.emptyMap(), Collections.emptyMap()); SimilarityService similarityService = new 
SimilarityService(indexSettings, Collections.emptyMap()); MapperRegistry mapperRegistry = new IndicesModule(Collections.emptyList()).getMapperRegistry(); mapperService = new MapperService(indexSettings, indexAnalyzers, xContentRegistry, similarityService, mapperRegistry, diff --git a/core/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java b/core/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java index 6d3c5bcbce1..bffe58db3a6 100644 --- a/core/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java +++ b/core/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java @@ -25,8 +25,10 @@ import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.IndexableFieldType; import org.apache.lucene.util.BytesRef; import org.elasticsearch.common.compress.CompressedXContent; +import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.xcontent.XContentFactory; import org.elasticsearch.index.IndexService; +import org.elasticsearch.index.mapper.MapperService.MergeReason; import org.elasticsearch.plugins.Plugin; import org.elasticsearch.test.ESSingleNodeTestCase; import org.elasticsearch.test.InternalSettingsPlugin; @@ -51,7 +53,11 @@ public class KeywordFieldMapperTests extends ESSingleNodeTestCase { @Before public void setup() { - indexService = createIndex("test"); + indexService = createIndex("test", Settings.builder() + .put("index.analysis.normalizer.my_lowercase.type", "custom") + .putArray("index.analysis.normalizer.my_lowercase.filter", "lowercase") + .put("index.analysis.normalizer.my_asciifolding.type", "custom") + .putArray("index.analysis.normalizer.my_asciifolding.filter", "asciifolding").build()); parser = indexService.mapperService().documentMapperParser(); } @@ -283,6 +289,62 @@ public class KeywordFieldMapperTests extends ESSingleNodeTestCase { assertFalse(fields[0].fieldType().omitNorms()); } + public void testNormalizer() throws IOException { 
+ String mapping = XContentFactory.jsonBuilder().startObject().startObject("type") + .startObject("properties").startObject("field") + .field("type", "keyword").field("normalizer", "my_lowercase").endObject().endObject() + .endObject().endObject().string(); + + DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping)); + + assertEquals(mapping, mapper.mappingSource().toString()); + + ParsedDocument doc = mapper.parse("test", "type", "1", XContentFactory.jsonBuilder() + .startObject() + .field("field", "AbC") + .endObject() + .bytes()); + + IndexableField[] fields = doc.rootDoc().getFields("field"); + assertEquals(2, fields.length); + + assertEquals(new BytesRef("abc"), fields[0].binaryValue()); + IndexableFieldType fieldType = fields[0].fieldType(); + assertThat(fieldType.omitNorms(), equalTo(true)); + assertFalse(fieldType.tokenized()); + assertFalse(fieldType.stored()); + assertThat(fieldType.indexOptions(), equalTo(IndexOptions.DOCS)); + assertThat(fieldType.storeTermVectors(), equalTo(false)); + assertThat(fieldType.storeTermVectorOffsets(), equalTo(false)); + assertThat(fieldType.storeTermVectorPositions(), equalTo(false)); + assertThat(fieldType.storeTermVectorPayloads(), equalTo(false)); + assertEquals(DocValuesType.NONE, fieldType.docValuesType()); + + assertEquals(new BytesRef("abc"), fields[1].binaryValue()); + fieldType = fields[1].fieldType(); + assertThat(fieldType.indexOptions(), equalTo(IndexOptions.NONE)); + assertEquals(DocValuesType.SORTED_SET, fieldType.docValuesType()); + } + + public void testUpdateNormalizer() throws IOException { + String mapping = XContentFactory.jsonBuilder().startObject().startObject("type") + .startObject("properties").startObject("field") + .field("type", "keyword").field("normalizer", "my_lowercase").endObject().endObject() + .endObject().endObject().string(); + indexService.mapperService().merge("type", new CompressedXContent(mapping), MergeReason.MAPPING_UPDATE, randomBoolean()); + + String 
mapping2 = XContentFactory.jsonBuilder().startObject().startObject("type") + .startObject("properties").startObject("field") + .field("type", "keyword").field("normalizer", "my_asciifolding").endObject().endObject() + .endObject().endObject().string(); + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, + () -> indexService.mapperService().merge("type", + new CompressedXContent(mapping2), MergeReason.MAPPING_UPDATE, randomBoolean())); + assertEquals( + "Mapper for [field] conflicts with existing mapping in other types:\n[mapper [field] has different [normalizer]]", + e.getMessage()); + } + public void testEmptyName() throws IOException { String mapping = XContentFactory.jsonBuilder().startObject() .startObject("type") diff --git a/core/src/test/java/org/elasticsearch/index/mapper/KeywordFieldTypeTests.java b/core/src/test/java/org/elasticsearch/index/mapper/KeywordFieldTypeTests.java index fbbabf8ee3a..00eecc669f8 100644 --- a/core/src/test/java/org/elasticsearch/index/mapper/KeywordFieldTypeTests.java +++ b/core/src/test/java/org/elasticsearch/index/mapper/KeywordFieldTypeTests.java @@ -20,22 +20,41 @@ package org.elasticsearch.index.mapper; import com.carrotsearch.randomizedtesting.generators.RandomStrings; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.LowerCaseFilter; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.Term; import org.apache.lucene.queries.TermsQuery; import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.RegexpQuery; import org.apache.lucene.search.TermQuery; +import org.elasticsearch.common.lucene.Lucene; import org.elasticsearch.common.unit.Fuzziness; -import org.elasticsearch.index.mapper.KeywordFieldMapper; -import 
org.elasticsearch.index.mapper.MappedFieldType; +import org.elasticsearch.index.analysis.AnalyzerScope; +import org.elasticsearch.index.analysis.NamedAnalyzer; import org.elasticsearch.index.mapper.KeywordFieldMapper.KeywordFieldType; import org.elasticsearch.index.mapper.MappedFieldType.Relation; +import org.junit.Before; import java.io.IOException; import java.util.Arrays; public class KeywordFieldTypeTests extends FieldTypeTestCase { + + @Before + public void setupProperties() { + addModifier(new Modifier("normalizer", false) { + @Override + public void modify(MappedFieldType ft) { + ((KeywordFieldType) ft).setNormalizer(Lucene.KEYWORD_ANALYZER); + } + }); + } + @Override protected MappedFieldType createDefaultFieldType() { return new KeywordFieldMapper.KeywordFieldType(); @@ -62,6 +81,31 @@ public class KeywordFieldTypeTests extends FieldTypeTestCase { assertEquals("Cannot search on field [field] since it is not indexed.", e.getMessage()); } + public void testTermQueryWithNormalizer() { + MappedFieldType ft = createDefaultFieldType(); + ft.setName("field"); + ft.setIndexOptions(IndexOptions.DOCS); + Analyzer normalizer = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer in = new WhitespaceTokenizer(); + TokenFilter out = new LowerCaseFilter(in); + return new TokenStreamComponents(in, out); + } + @Override + protected TokenStream normalize(String fieldName, TokenStream in) { + return new LowerCaseFilter(in); + } + }; + ft.setSearchAnalyzer(new NamedAnalyzer("my_normalizer", AnalyzerScope.INDEX, normalizer)); + assertEquals(new TermQuery(new Term("field", "foo bar")), ft.termQuery("fOo BaR", null)); + + ft.setIndexOptions(IndexOptions.NONE); + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, + () -> ft.termQuery("bar", null)); + assertEquals("Cannot search on field [field] since it is not indexed.", e.getMessage()); + } + public void testTermsQuery() { MappedFieldType ft = 
createDefaultFieldType(); ft.setName("field"); diff --git a/core/src/test/java/org/elasticsearch/index/mapper/ParentFieldMapperTests.java b/core/src/test/java/org/elasticsearch/index/mapper/ParentFieldMapperTests.java index 2b3aad750dd..225940d8eda 100644 --- a/core/src/test/java/org/elasticsearch/index/mapper/ParentFieldMapperTests.java +++ b/core/src/test/java/org/elasticsearch/index/mapper/ParentFieldMapperTests.java @@ -101,7 +101,7 @@ public class ParentFieldMapperTests extends ESSingleNodeTestCase { IndexSettings indexSettings = IndexSettingsModule.newIndexSettings(index, Settings.EMPTY); NamedAnalyzer namedAnalyzer = new NamedAnalyzer("default", AnalyzerScope.INDEX, new StandardAnalyzer()); IndexAnalyzers indexAnalyzers = new IndexAnalyzers(indexSettings, namedAnalyzer, namedAnalyzer, namedAnalyzer, - Collections.emptyMap()); + Collections.emptyMap(), Collections.emptyMap()); SimilarityService similarityService = new SimilarityService(indexSettings, Collections.emptyMap()); MapperService mapperService = new MapperService(indexSettings, indexAnalyzers, xContentRegistry(), similarityService, new IndicesModule(emptyList()).getMapperRegistry(), () -> null); diff --git a/docs/reference/analysis.asciidoc b/docs/reference/analysis.asciidoc index 42b1ef65d24..a8299f45e19 100644 --- a/docs/reference/analysis.asciidoc +++ b/docs/reference/analysis.asciidoc @@ -112,6 +112,8 @@ include::analysis/testing.asciidoc[] include::analysis/analyzers.asciidoc[] +include::analysis/normalizers.asciidoc[] + include::analysis/tokenizers.asciidoc[] include::analysis/tokenfilters.asciidoc[] diff --git a/docs/reference/analysis/normalizers.asciidoc b/docs/reference/analysis/normalizers.asciidoc new file mode 100644 index 00000000000..0287f140c74 --- /dev/null +++ b/docs/reference/analysis/normalizers.asciidoc @@ -0,0 +1,57 @@ +[[analysis-normalizers]] +== Normalizers + +experimental[] + +Normalizers are similar to analyzers except that they may only emit a single +token. 
As a consequence, they do not have a tokenizer and only accept a subset +of the available char filters and token filters. Only the filters that work on +a per-character basis are allowed. For instance a lowercasing filter would be +allowed, but not a stemming filter, which needs to look at the keyword as a +whole. + +[float] +=== Custom normalizers + +Elasticsearch does not ship with built-in normalizers so far, so the only way +to get one is by building a custom one. Custom normalizers take a list of char +<> and a list of +<>. + +[source,js] +-------------------------------- +PUT index +{ + "settings": { + "analysis": { + "char_filter": { + "quote": { + "type": "mapping", + "mappings": [ + "« => \"", + "» => \"" + ] + } + }, + "normalizer": { + "my_normalizer": { + "type": "custom", + "char_filter": ["quote"], + "filter": ["lowercase", "asciifolding"] + } + } + } + }, + "mappings": { + "type": { + "properties": { + "foo": { + "type": "keyword", + "normalizer": "my_normalizer" + } + } + } + } +} +-------------------------------- +// CONSOLE diff --git a/docs/reference/mapping/params.asciidoc b/docs/reference/mapping/params.asciidoc index e7d2d7ac0c8..24220356233 100644 --- a/docs/reference/mapping/params.asciidoc +++ b/docs/reference/mapping/params.asciidoc @@ -8,6 +8,7 @@ parameters that are used by <>: The following mapping parameters are common to some or all field datatypes: * <> +* <> * <> * <> * <> @@ -34,6 +35,8 @@ The following mapping parameters are common to some or all field datatypes: include::params/analyzer.asciidoc[] +include::params/normalizer.asciidoc[] + include::params/boost.asciidoc[] include::params/coerce.asciidoc[] diff --git a/docs/reference/mapping/params/normalizer.asciidoc b/docs/reference/mapping/params/normalizer.asciidoc new file mode 100644 index 00000000000..c0636763e52 --- /dev/null +++ b/docs/reference/mapping/params/normalizer.asciidoc @@ -0,0 +1,163 @@ +[[normalizer]] +=== `normalizer` + +The `normalizer` property of <> fields is
similar to +<<analyzer,`analyzer`>> except that it guarantees that the analysis chain +produces a single token. + +The `normalizer` is applied prior to indexing the keyword, as well as at +search-time when the `keyword` field is searched via a query parser such as +the <<query-dsl-match-query,`match`>> query. + +[source,js] +-------------------------------- +PUT index +{ + "settings": { + "analysis": { + "normalizer": { + "my_normalizer": { + "type": "custom", + "char_filter": [], + "filter": ["lowercase", "asciifolding"] + } + } + } + }, + "mappings": { + "type": { + "properties": { + "foo": { + "type": "keyword", + "normalizer": "my_normalizer" + } + } + } + } +} + +PUT index/type/1 +{ + "foo": "BÀR" +} + +PUT index/type/2 +{ + "foo": "bar" +} + +PUT index/type/3 +{ + "foo": "baz" +} + +POST index/_refresh + +GET index/_search +{ + "query": { + "match": { + "foo": "BAR" + } + } +} +-------------------------------- +// CONSOLE + +The above query matches documents 1 and 2 since `BÀR` is converted to `bar` at +both index and query time. + +[source,js] +---------------------------- +{ + "took": $body.took, + "timed_out": false, + "_shards": { + "total": 5, + "successful": 5, + "failed": 0 + }, + "hits": { + "total": 2, + "max_score": 0.2876821, + "hits": [ + { + "_index": "index", + "_type": "type", + "_id": "2", + "_score": 0.2876821, + "_source": { + "foo": "bar" + } + }, + { + "_index": "index", + "_type": "type", + "_id": "1", + "_score": 0.2876821, + "_source": { + "foo": "BÀR" + } + } + ] + } +} +---------------------------- +// TESTRESPONSE[s/"took".*/"took": "$body.took",/] + +Also, the fact that keywords are converted prior to indexing means that +aggregations return normalized values: + +[source,js] +-------------------------------- +GET index/_search +{ + "size": 0, + "aggs": { + "foo_terms": { + "terms": { + "field": "foo" + } + } + } +} +-------------------------------- +// CONSOLE +// TEST[continued] + +returns + +[source,js] +---------------------------- +{ + "took": 43, + "timed_out": false, + 
"_shards": { + "total": 5, + "successful": 5, + "failed": 0 + }, + "hits": { + "total": 3, + "max_score": 0.0, + "hits": [] + }, + "aggregations": { + "foo_terms": { + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 0, + "buckets": [ + { + "key": "bar", + "doc_count": 2 + }, + { + "key": "baz", + "doc_count": 1 + } + ] + } + } +} +---------------------------- +// TESTRESPONSE[s/"took".*/"took": "$body.took",/] diff --git a/docs/reference/mapping/types/keyword.asciidoc b/docs/reference/mapping/types/keyword.asciidoc index 7c09ef46e55..316c92a73e6 100644 --- a/docs/reference/mapping/types/keyword.asciidoc +++ b/docs/reference/mapping/types/keyword.asciidoc @@ -109,6 +109,12 @@ The following parameters are accepted by `keyword` fields: Which scoring algorithm or _similarity_ should be used. Defaults to `classic`, which uses TF/IDF. +<<normalizer,`normalizer`>>:: + + experimental[] + How to pre-process the keyword prior to indexing. Defaults to `null`, + meaning the keyword is kept as-is. + NOTE: Indexes imported from 2.x do not support `keyword`. Instead they will attempt to downgrade `keyword` into `string`. This allows you to merge modern mappings with legacy mappings. Long lived indexes will have to be recreated