From b24326271e6778d5d595005e7e1e4258e7e7ee24 Mon Sep 17 00:00:00 2001 From: Matt Weber Date: Wed, 10 May 2017 01:35:11 -0700 Subject: [PATCH] Add ICUCollationFieldMapper (#24126) Adds a new "icu_collation" field type that exposes lucene's ICUCollationDocValuesField. ICUCollationDocValuesField is the replacement for ICUCollationKeyFilter which has been deprecated since Lucene 5. --- .../index/mapper/StringFieldType.java | 6 +- docs/plugins/analysis-icu.asciidoc | 100 ++- plugins/analysis-icu/build.gradle | 2 +- .../ICUCollationKeywordFieldMapper.java | 746 ++++++++++++++++++ .../analysis/icu/AnalysisICUPlugin.java | 29 +- .../index/mapper/CollationFieldTypeTests.java | 145 ++++ .../ICUCollationKeywordFieldMapperIT.java | 443 +++++++++++ .../ICUCollationKeywordFieldMapperTests.java | 342 ++++++++ .../index/mapper/FieldTypeTestCase.java | 6 +- 9 files changed, 1774 insertions(+), 45 deletions(-) create mode 100644 plugins/analysis-icu/src/main/java/org/elasticsearch/index/mapper/ICUCollationKeywordFieldMapper.java create mode 100644 plugins/analysis-icu/src/test/java/org/elasticsearch/index/mapper/CollationFieldTypeTests.java create mode 100644 plugins/analysis-icu/src/test/java/org/elasticsearch/index/mapper/ICUCollationKeywordFieldMapperIT.java create mode 100644 plugins/analysis-icu/src/test/java/org/elasticsearch/index/mapper/ICUCollationKeywordFieldMapperTests.java rename {core/src/test => test/framework/src/main}/java/org/elasticsearch/index/mapper/FieldTypeTestCase.java (99%) diff --git a/core/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java b/core/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java index a7d59fcfb42..37834b93a1e 100644 --- a/core/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java +++ b/core/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java @@ -57,7 +57,7 @@ public abstract class StringFieldType extends TermBasedFieldType { } @Override - public final Query fuzzyQuery(Object value, 
Fuzziness fuzziness, int prefixLength, int maxExpansions, + public Query fuzzyQuery(Object value, Fuzziness fuzziness, int prefixLength, int maxExpansions, boolean transpositions) { failIfNotIndexed(); return new FuzzyQuery(new Term(name(), indexedValueForSearch(value)), @@ -65,7 +65,7 @@ public abstract class StringFieldType extends TermBasedFieldType { } @Override - public final Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) { + public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) { failIfNotIndexed(); PrefixQuery query = new PrefixQuery(new Term(name(), indexedValueForSearch(value))); if (method != null) { @@ -75,7 +75,7 @@ public abstract class StringFieldType extends TermBasedFieldType { } @Override - public final Query regexpQuery(String value, int flags, int maxDeterminizedStates, + public Query regexpQuery(String value, int flags, int maxDeterminizedStates, MultiTermQuery.RewriteMethod method, QueryShardContext context) { failIfNotIndexed(); RegexpQuery query = new RegexpQuery(new Term(name(), indexedValueForSearch(value)), flags, maxDeterminizedStates); diff --git a/docs/plugins/analysis-icu.asciidoc b/docs/plugins/analysis-icu.asciidoc index fd322c112e6..d95766bb190 100644 --- a/docs/plugins/analysis-icu.asciidoc +++ b/docs/plugins/analysis-icu.asciidoc @@ -302,50 +302,46 @@ PUT icu_sample -------------------------------------------------- // CONSOLE + [[analysis-icu-collation]] ==== ICU Collation Token Filter +[WARNING] +====== +This token filter has been deprecated since Lucene 5.0. Please use +<<analysis-icu-collation-keyword-field>>. +====== + +[[analysis-icu-collation-keyword-field]] +==== ICU Collation Keyword Field + Collations are used for sorting documents in a language-specific word order. 
-The `icu_collation` token filter is available to all indices and defaults to -using the -{defguide}/sorting-collations.html#uca[DUCET collation], +The `icu_collation_keyword` field type is available to all indices and will encode +the terms directly as bytes in a doc values field and a single indexed token just +like a standard {ref}/keyword.html[Keyword Field]. + +Defaults to using {defguide}/sorting-collations.html#uca[DUCET collation], which is a best-effort attempt at language-neutral sorting. Below is an example of how to set up a field for sorting German names in ``phonebook'' order: [source,js] --------------------------------------------------- -PUT /my_index +-------------------------- +PUT my_index { - "settings": { - "analysis": { - "filter": { - "german_phonebook": { - "type": "icu_collation", - "language": "de", - "country": "DE", - "variant": "@collation=phonebook" - } - }, - "analyzer": { - "german_phonebook": { - "tokenizer": "keyword", - "filter": [ "german_phonebook" ] - } - } - } - }, "mappings": { "user": { "properties": { - "name": { <1> + "name": { <1> "type": "text", "fields": { - "sort": { <2> - "type": "text", - "fielddata": true, - "analyzer": "german_phonebook" + "sort": { <2> + "type": "icu_collation_keyword", + "index": false, + "language": "de", + "country": "DE", + "variant": "@collation=phonebook" } } } @@ -364,15 +360,47 @@ GET _search <3> "sort": "name.sort" } --------------------------------------------------- +-------------------------- // CONSOLE <1> The `name` field uses the `standard` analyzer, and so support full text queries. -<2> The `name.sort` field uses the `keyword` analyzer to preserve the name as - a single token, and applies the `german_phonebook` token filter to index - the value in German phonebook sort order. +<2> The `name.sort` field is an `icu_collation_keyword` field that will preserve the name as + a single token doc values, and applies the German ``phonebook'' order. 
<3> An example query which searches the `name` field and sorts on the `name.sort` field. +==== Parameters for ICU Collation Keyword Fields + +The following parameters are accepted by `icu_collation_keyword` fields: + +[horizontal] + +`doc_values`:: + + Should the field be stored on disk in a column-stride fashion, so that it + can later be used for sorting, aggregations, or scripting? Accepts `true` + (default) or `false`. + +`index`:: + + Should the field be searchable? Accepts `true` (default) or `false`. + +`null_value`:: + + Accepts a string value which is substituted for any explicit `null` + values. Defaults to `null`, which means the field is treated as missing. + +`store`:: + + Whether the field value should be stored and retrievable separately from + the {ref}/mapping-source-field.html[`_source`] field. Accepts `true` or `false` + (default). + +`fields`:: + + Multi-fields allow the same string value to be indexed in multiple ways for + different purposes, such as one field for search and a multi-field for + sorting and aggregations. + ===== Collation options `strength`:: @@ -404,14 +432,14 @@ Possible values: `shifted` or `non-ignorable`. Sets the alternate handling for strength `quaternary` to be either shifted or non-ignorable. Which boils down to ignoring punctuation and whitespace. -`caseLevel`:: +`case_level`:: Possible values: `true` or `false` (default). Whether case level sorting is required. When strength is set to `primary` this will ignore accent differences. -`caseFirst`:: +`case_first`:: Possible values: `lower` or `upper`. Useful to control which case is sorted first when case is not ignored for strength `tertiary`. The default depends on @@ -424,11 +452,11 @@ according to their numeric representation. For example the value `egg-9` is sorted before the value `egg-21`. -`variableTop`:: +`variable_top`:: Single character or contraction. Controls what is variable for `alternate`. 
-`hiraganaQuaternaryMode`:: +`hiragana_quaternary_mode`:: Possible values: `true` or `false`. Distinguishing between Katakana and Hiragana characters in `quaternary` strength. diff --git a/plugins/analysis-icu/build.gradle b/plugins/analysis-icu/build.gradle index 9ed155b5fc4..53f2747c0a2 100644 --- a/plugins/analysis-icu/build.gradle +++ b/plugins/analysis-icu/build.gradle @@ -29,4 +29,4 @@ dependencies { dependencyLicenses { mapping from: /lucene-.*/, to: 'lucene' -} +} \ No newline at end of file diff --git a/plugins/analysis-icu/src/main/java/org/elasticsearch/index/mapper/ICUCollationKeywordFieldMapper.java b/plugins/analysis-icu/src/main/java/org/elasticsearch/index/mapper/ICUCollationKeywordFieldMapper.java new file mode 100644 index 00000000000..408ad0a4543 --- /dev/null +++ b/plugins/analysis-icu/src/main/java/org/elasticsearch/index/mapper/ICUCollationKeywordFieldMapper.java @@ -0,0 +1,746 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.mapper; + +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.RawCollationKey; +import com.ibm.icu.text.RuleBasedCollator; +import com.ibm.icu.util.ULocale; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.SortedDocValuesField; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.search.MultiTermQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.lucene.Lucene; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.unit.Fuzziness; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.common.xcontent.XContentParser; +import org.elasticsearch.common.xcontent.support.XContentMapValues; +import org.elasticsearch.index.analysis.IndexableBinaryStringTools; +import org.elasticsearch.index.fielddata.IndexFieldData; +import org.elasticsearch.index.fielddata.plain.DocValuesIndexFieldData; +import org.elasticsearch.index.query.QueryShardContext; +import org.elasticsearch.search.DocValueFormat; +import org.joda.time.DateTimeZone; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.function.LongSupplier; + +public class ICUCollationKeywordFieldMapper extends FieldMapper { + + public static final String CONTENT_TYPE = "icu_collation_keyword"; + + public static class Defaults { + public static final MappedFieldType FIELD_TYPE = new CollationFieldType(); + + static { + FIELD_TYPE.setTokenized(false); + FIELD_TYPE.setOmitNorms(true); + FIELD_TYPE.setIndexOptions(IndexOptions.DOCS); + FIELD_TYPE.freeze(); + } + + public static final String NULL_VALUE = null; + } + + public static final class CollationFieldType extends StringFieldType 
{ + private Collator collator = null; + + public CollationFieldType() { + setIndexAnalyzer(Lucene.KEYWORD_ANALYZER); + setSearchAnalyzer(Lucene.KEYWORD_ANALYZER); + } + + protected CollationFieldType(CollationFieldType ref) { + super(ref); + this.collator = ref.collator; + } + + public CollationFieldType clone() { + return new CollationFieldType(this); + } + + @Override + public boolean equals(Object o) { + return super.equals(o) && Objects.equals(collator, ((CollationFieldType) o).collator); + } + + @Override + public void checkCompatibility(MappedFieldType otherFT, List conflicts, boolean strict) { + super.checkCompatibility(otherFT, conflicts, strict); + CollationFieldType other = (CollationFieldType) otherFT; + if (!Objects.equals(collator, other.collator)) { + conflicts.add("mapper [" + name() + "] has different [collator]"); + } + } + + @Override + public int hashCode() { + return 31 * super.hashCode() + Objects.hashCode(collator); + } + + @Override + public String typeName() { + return CONTENT_TYPE; + } + + public Collator collator() { + return collator; + } + + public void setCollator(Collator collator) { + checkIfFrozen(); + this.collator = collator.isFrozen() ? 
collator : collator.freeze(); + } + + @Override + public Query nullValueQuery() { + if (nullValue() == null) { + return null; + } + return termQuery(nullValue(), null); + } + + @Override + public IndexFieldData.Builder fielddataBuilder() { + failIfNoDocValues(); + return new DocValuesIndexFieldData.Builder(); + } + + @Override + protected BytesRef indexedValueForSearch(Object value) { + if (value == null) { + return null; + } + if (value instanceof BytesRef) { + value = ((BytesRef) value).utf8ToString(); + } + + if (collator != null) { + RawCollationKey key = collator.getRawCollationKey(value.toString(), null); + return new BytesRef(key.bytes, 0, key.size); + } else { + throw new IllegalStateException("collator is null"); + } + } + + @Override + public Query fuzzyQuery(Object value, Fuzziness fuzziness, int prefixLength, int maxExpansions, + boolean transpositions) { + throw new UnsupportedOperationException(); + } + + @Override + public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) { + throw new UnsupportedOperationException(); + } + + @Override + public Query regexpQuery(String value, int flags, int maxDeterminizedStates, + MultiTermQuery.RewriteMethod method, QueryShardContext context) { + throw new UnsupportedOperationException(); + } + + public static DocValueFormat COLLATE_FORMAT = new DocValueFormat() { + @Override + public String getWriteableName() { + return "collate"; + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + } + + @Override + public String format(long value) { + throw new UnsupportedOperationException(); + } + + @Override + public String format(double value) { + throw new UnsupportedOperationException(); + } + + @Override + public String format(BytesRef value) { + int encodedLength = IndexableBinaryStringTools.getEncodedLength(value.bytes, value.offset, value.length); + char[] encoded = new char[encodedLength]; + IndexableBinaryStringTools.encode(value.bytes, 
value.offset, value.length, encoded, 0, encodedLength); + return new String(encoded, 0, encodedLength); + } + + @Override + public long parseLong(String value, boolean roundUp, LongSupplier now) { + throw new UnsupportedOperationException(); + } + + @Override + public double parseDouble(String value, boolean roundUp, LongSupplier now) { + throw new UnsupportedOperationException(); + } + + @Override + public BytesRef parseBytesRef(String value) { + char[] encoded = value.toCharArray(); + int decodedLength = IndexableBinaryStringTools.getDecodedLength(encoded, 0, encoded.length); + byte[] decoded = new byte[decodedLength]; + IndexableBinaryStringTools.decode(encoded, 0, encoded.length, decoded, 0, decodedLength); + return new BytesRef(decoded); + } + }; + + @Override + public DocValueFormat docValueFormat(final String format, final DateTimeZone timeZone) { + return COLLATE_FORMAT; + } + } + + public static class Builder extends FieldMapper.Builder { + private String rules = null; + private String language = null; + private String country = null; + private String variant = null; + private String strength = null; + private String decomposition = null; + private String alternate = null; + private boolean caseLevel = false; + private String caseFirst = null; + private boolean numeric = false; + private String variableTop = null; + private boolean hiraganaQuaternaryMode = false; + private String nullValue = Defaults.NULL_VALUE; + + public Builder(String name) { + super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE); + builder = this; + } + + @Override + public CollationFieldType fieldType() { + return (CollationFieldType) super.fieldType(); + } + + @Override + public Builder indexOptions(IndexOptions indexOptions) { + if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) > 0) { + throw new IllegalArgumentException("The [" + CONTENT_TYPE + "] field does not support positions, got [index_options]=" + + indexOptionToString(indexOptions)); + } + + return 
super.indexOptions(indexOptions); + } + + public String rules() { + return rules; + } + + public Builder rules(final String rules) { + this.rules = rules; + return this; + } + + public String language() { + return language; + } + + public Builder language(final String language) { + this.language = language; + return this; + } + + public String country() { + return country; + } + + public Builder country(final String country) { + this.country = country; + return this; + } + + public String variant() { + return variant; + } + + public Builder variant(final String variant) { + this.variant = variant; + return this; + } + + public String strength() { + return strength; + } + + public Builder strength(final String strength) { + this.strength = strength; + return this; + } + + public String decomposition() { + return decomposition; + } + + public Builder decomposition(final String decomposition) { + this.decomposition = decomposition; + return this; + } + + public String alternate() { + return alternate; + } + + public Builder alternate(final String alternate) { + this.alternate = alternate; + return this; + } + + public boolean caseLevel() { + return caseLevel; + } + + public Builder caseLevel(final boolean caseLevel) { + this.caseLevel = caseLevel; + return this; + } + + public String caseFirst() { + return caseFirst; + } + + public Builder caseFirst(final String caseFirst) { + this.caseFirst = caseFirst; + return this; + } + + public boolean numeric() { + return numeric; + } + + public Builder numeric(final boolean numeric) { + this.numeric = numeric; + return this; + } + + public String variableTop() { + return variableTop; + } + + public Builder variableTop(final String variableTop) { + this.variableTop = variableTop; + return this; + } + + public boolean hiraganaQuaternaryMode() { + return hiraganaQuaternaryMode; + } + + public Builder hiraganaQuaternaryMode(final boolean hiraganaQuaternaryMode) { + this.hiraganaQuaternaryMode = hiraganaQuaternaryMode; + return 
this; + } + + public Collator buildCollator() { + Collator collator; + if (rules != null) { + try { + collator = new RuleBasedCollator(rules); + } catch (Exception e) { + throw new IllegalArgumentException("Failed to parse collation rules", e); + } + } else { + if (language != null) { + ULocale locale; + if (country != null) { + if (variant != null) { + locale = new ULocale(language, country, variant); + } else { + locale = new ULocale(language, country); + } + } else { + locale = new ULocale(language); + } + collator = Collator.getInstance(locale); + } else { + collator = Collator.getInstance(); + } + } + + // set the strength flag, otherwise it will be the default. + if (strength != null) { + if (strength.equalsIgnoreCase("primary")) { + collator.setStrength(Collator.PRIMARY); + } else if (strength.equalsIgnoreCase("secondary")) { + collator.setStrength(Collator.SECONDARY); + } else if (strength.equalsIgnoreCase("tertiary")) { + collator.setStrength(Collator.TERTIARY); + } else if (strength.equalsIgnoreCase("quaternary")) { + collator.setStrength(Collator.QUATERNARY); + } else if (strength.equalsIgnoreCase("identical")) { + collator.setStrength(Collator.IDENTICAL); + } else { + throw new IllegalArgumentException("Invalid strength: " + strength); + } + } + + // set the decomposition flag, otherwise it will be the default. 
+ if (decomposition != null) { + if (decomposition.equalsIgnoreCase("no")) { + collator.setDecomposition(Collator.NO_DECOMPOSITION); + } else if (decomposition.equalsIgnoreCase("canonical")) { + collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION); + } else { + throw new IllegalArgumentException("Invalid decomposition: " + decomposition); + } + } + + // expert options: concrete subclasses are always a RuleBasedCollator + RuleBasedCollator rbc = (RuleBasedCollator) collator; + if (alternate != null) { + if (alternate.equalsIgnoreCase("shifted")) { + rbc.setAlternateHandlingShifted(true); + } else if (alternate.equalsIgnoreCase("non-ignorable")) { + rbc.setAlternateHandlingShifted(false); + } else { + throw new IllegalArgumentException("Invalid alternate: " + alternate); + } + } + + if (caseLevel) { + rbc.setCaseLevel(true); + } + + if (caseFirst != null) { + if (caseFirst.equalsIgnoreCase("lower")) { + rbc.setLowerCaseFirst(true); + } else if (caseFirst.equalsIgnoreCase("upper")) { + rbc.setUpperCaseFirst(true); + } else { + throw new IllegalArgumentException("Invalid caseFirst: " + caseFirst); + } + } + + if (numeric) { + rbc.setNumericCollation(true); + } + + if (variableTop != null) { + rbc.setVariableTop(variableTop); + } + + if (hiraganaQuaternaryMode) { + rbc.setHiraganaQuaternary(true); + } + + // freeze so thread-safe + return collator.freeze(); + } + + @Override + public ICUCollationKeywordFieldMapper build(BuilderContext context) { + final Collator collator = buildCollator(); + fieldType().setCollator(collator); + setupFieldType(context); + return new ICUCollationKeywordFieldMapper(name, fieldType, defaultFieldType, context.indexSettings(), + multiFieldsBuilder.build(this, context), copyTo, rules, language, country, variant, strength, decomposition, + alternate, caseLevel, caseFirst, numeric, variableTop, hiraganaQuaternaryMode, collator); + } + } + + public static class TypeParser implements Mapper.TypeParser { + @Override + public Mapper.Builder 
parse(String name, Map node, ParserContext parserContext) + throws MapperParsingException { + Builder builder = new Builder(name); + TypeParsers.parseField(builder, name, node, parserContext); + for (Iterator> iterator = node.entrySet().iterator(); iterator.hasNext(); ) { + Map.Entry entry = iterator.next(); + String fieldName = entry.getKey(); + Object fieldNode = entry.getValue(); + switch (fieldName) { + case "null_value": + if (fieldNode == null) { + throw new MapperParsingException("Property [null_value] cannot be null."); + } + builder.nullValue(fieldNode.toString()); + iterator.remove(); + break; + case "norms": + builder.omitNorms(!XContentMapValues.nodeBooleanValue(fieldNode, "norms")); + iterator.remove(); + break; + case "rules": + builder.rules(XContentMapValues.nodeStringValue(fieldNode, null)); + iterator.remove(); + break; + case "language": + builder.language(XContentMapValues.nodeStringValue(fieldNode, null)); + iterator.remove(); + break; + case "country": + builder.country(XContentMapValues.nodeStringValue(fieldNode, null)); + iterator.remove(); + break; + case "variant": + builder.variant(XContentMapValues.nodeStringValue(fieldNode, null)); + iterator.remove(); + break; + case "strength": + builder.strength(XContentMapValues.nodeStringValue(fieldNode, null)); + iterator.remove(); + break; + case "decomposition": + builder.decomposition(XContentMapValues.nodeStringValue(fieldNode, null)); + iterator.remove(); + break; + case "alternate": + builder.alternate(XContentMapValues.nodeStringValue(fieldNode, null)); + iterator.remove(); + break; + case "case_level": + builder.caseLevel(XContentMapValues.nodeBooleanValue(fieldNode, false)); + iterator.remove(); + break; + case "case_first": + builder.caseFirst(XContentMapValues.nodeStringValue(fieldNode, null)); + iterator.remove(); + break; + case "numeric": + builder.numeric(XContentMapValues.nodeBooleanValue(fieldNode, false)); + iterator.remove(); + break; + case "variable_top": + 
builder.variableTop(XContentMapValues.nodeStringValue(fieldNode, null)); + iterator.remove(); + break; + case "hiragana_quaternary_mode": + builder.hiraganaQuaternaryMode(XContentMapValues.nodeBooleanValue(fieldNode, false)); + iterator.remove(); + break; + default: + break; + } + } + + return builder; + } + } + + private final String rules; + private final String language; + private final String country; + private final String variant; + private final String strength; + private final String decomposition; + private final String alternate; + private final boolean caseLevel; + private final String caseFirst; + private final boolean numeric; + private final String variableTop; + private final boolean hiraganaQuaternaryMode; + private final Collator collator; + + protected ICUCollationKeywordFieldMapper(String simpleName, MappedFieldType fieldType, MappedFieldType defaultFieldType, + Settings indexSettings, MultiFields multiFields, CopyTo copyTo, String rules, String language, + String country, String variant, + String strength, String decomposition, String alternate, boolean caseLevel, String caseFirst, + boolean numeric, String variableTop, boolean hiraganaQuaternaryMode, Collator collator) { + super(simpleName, fieldType, defaultFieldType, indexSettings, multiFields, copyTo); + assert collator.isFrozen(); + this.rules = rules; + this.language = language; + this.country = country; + this.variant = variant; + this.strength = strength; + this.decomposition = decomposition; + this.alternate = alternate; + this.caseLevel = caseLevel; + this.caseFirst = caseFirst; + this.numeric = numeric; + this.variableTop = variableTop; + this.hiraganaQuaternaryMode = hiraganaQuaternaryMode; + this.collator = collator; + } + + @Override + public CollationFieldType fieldType() { + return (CollationFieldType) super.fieldType(); + } + + @Override + protected String contentType() { + return CONTENT_TYPE; + } + + @Override + protected void doMerge(Mapper mergeWith, boolean updateAllTypes) 
{ + super.doMerge(mergeWith, updateAllTypes); + + List conflicts = new ArrayList<>(); + ICUCollationKeywordFieldMapper icuMergeWith = (ICUCollationKeywordFieldMapper) mergeWith; + + if (!Objects.equals(rules, icuMergeWith.rules)) { + conflicts.add("Cannot update rules setting for [" + CONTENT_TYPE + "]"); + } + + if (!Objects.equals(language, icuMergeWith.language)) { + conflicts.add("Cannot update language setting for [" + CONTENT_TYPE + "]"); + } + + if (!Objects.equals(country, icuMergeWith.country)) { + conflicts.add("Cannot update country setting for [" + CONTENT_TYPE + "]"); + } + + if (!Objects.equals(variant, icuMergeWith.variant)) { + conflicts.add("Cannot update variant setting for [" + CONTENT_TYPE + "]"); + } + + if (!Objects.equals(strength, icuMergeWith.strength)) { + conflicts.add("Cannot update strength setting for [" + CONTENT_TYPE + "]"); + } + + if (!Objects.equals(decomposition, icuMergeWith.decomposition)) { + conflicts.add("Cannot update decomposition setting for [" + CONTENT_TYPE + "]"); + } + + if (!Objects.equals(alternate, icuMergeWith.alternate)) { + conflicts.add("Cannot update alternate setting for [" + CONTENT_TYPE + "]"); + } + + if (caseLevel != icuMergeWith.caseLevel) { + conflicts.add("Cannot update case_level setting for [" + CONTENT_TYPE + "]"); + } + + if (!Objects.equals(caseFirst, icuMergeWith.caseFirst)) { + conflicts.add("Cannot update case_first setting for [" + CONTENT_TYPE + "]"); + } + + if (numeric != icuMergeWith.numeric) { + conflicts.add("Cannot update numeric setting for [" + CONTENT_TYPE + "]"); + } + + if (!Objects.equals(variableTop, icuMergeWith.variableTop)) { + conflicts.add("Cannot update variable_top setting for [" + CONTENT_TYPE + "]"); + } + + if (hiraganaQuaternaryMode != icuMergeWith.hiraganaQuaternaryMode) { + conflicts.add("Cannot update hiragana_quaternary_mode setting for [" + CONTENT_TYPE + "]"); + } + + if (!conflicts.isEmpty()) { + throw new IllegalArgumentException("Can't merge because of 
conflicts: " + conflicts); + } + } + + @Override + protected void doXContentBody(XContentBuilder builder, boolean includeDefaults, Params params) throws IOException { + super.doXContentBody(builder, includeDefaults, params); + + if (includeDefaults || fieldType().nullValue() != null) { + builder.field("null_value", fieldType().nullValue()); + } + + if (includeDefaults || rules != null) { + builder.field("rules", rules); + } + + if (includeDefaults || language != null) { + builder.field("language", language); + } + + if (includeDefaults || country != null) { + builder.field("country", country); + } + + if (includeDefaults || variant != null) { + builder.field("variant", variant); + } + + if (includeDefaults || strength != null) { + builder.field("strength", strength); + } + + if (includeDefaults || decomposition != null) { + builder.field("decomposition", decomposition); + } + + if (includeDefaults || alternate != null) { + builder.field("alternate", alternate); + } + + if (includeDefaults || caseLevel) { + builder.field("case_level", caseLevel); + } + + if (includeDefaults || caseFirst != null) { + builder.field("case_first", caseFirst); + } + + if (includeDefaults || numeric) { + builder.field("numeric", numeric); + } + + if (includeDefaults || variableTop != null) { + builder.field("variable_top", variableTop); + } + + if (includeDefaults || hiraganaQuaternaryMode) { + builder.field("hiragana_quaternary_mode", hiraganaQuaternaryMode); + } + } + + @Override + protected void parseCreateField(ParseContext context, List fields) throws IOException { + final String value; + if (context.externalValueSet()) { + value = context.externalValue().toString(); + } else { + XContentParser parser = context.parser(); + if (parser.currentToken() == XContentParser.Token.VALUE_NULL) { + value = fieldType().nullValueAsString(); + } else { + value = parser.textOrNull(); + } + } + + if (value == null) { + return; + } + + RawCollationKey key = collator.getRawCollationKey(value, null); + 
final BytesRef binaryValue = new BytesRef(key.bytes, 0, key.size); + + if (fieldType().indexOptions() != IndexOptions.NONE || fieldType().stored()) { + Field field = new Field(fieldType().name(), binaryValue, fieldType()); + fields.add(field); + } + + if (fieldType().hasDocValues()) { + fields.add(new SortedDocValuesField(fieldType().name(), binaryValue)); + } + } +} diff --git a/plugins/analysis-icu/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java b/plugins/analysis-icu/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java index 059dabb4f46..58ebdc8e2a8 100644 --- a/plugins/analysis-icu/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java +++ b/plugins/analysis-icu/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java @@ -19,6 +19,9 @@ package org.elasticsearch.plugin.analysis.icu; +import static java.util.Collections.singletonMap; + +import org.elasticsearch.common.io.stream.NamedWriteableRegistry; import org.elasticsearch.index.analysis.CharFilterFactory; import org.elasticsearch.index.analysis.IcuCollationTokenFilterFactory; import org.elasticsearch.index.analysis.IcuFoldingTokenFilterFactory; @@ -28,16 +31,20 @@ import org.elasticsearch.index.analysis.IcuTokenizerFactory; import org.elasticsearch.index.analysis.IcuTransformTokenFilterFactory; import org.elasticsearch.index.analysis.TokenFilterFactory; import org.elasticsearch.index.analysis.TokenizerFactory; +import org.elasticsearch.index.mapper.ICUCollationKeywordFieldMapper; +import org.elasticsearch.index.mapper.Mapper; import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; import org.elasticsearch.plugins.AnalysisPlugin; +import org.elasticsearch.plugins.MapperPlugin; import org.elasticsearch.plugins.Plugin; +import org.elasticsearch.search.DocValueFormat; +import java.util.Collections; import java.util.HashMap; +import java.util.List; import java.util.Map; -import static 
java.util.Collections.singletonMap; - -public class AnalysisICUPlugin extends Plugin implements AnalysisPlugin { +public class AnalysisICUPlugin extends Plugin implements AnalysisPlugin, MapperPlugin { @Override public Map> getCharFilters() { return singletonMap("icu_normalizer", IcuNormalizerCharFilterFactory::new); @@ -57,4 +64,20 @@ public class AnalysisICUPlugin extends Plugin implements AnalysisPlugin { public Map> getTokenizers() { return singletonMap("icu_tokenizer", IcuTokenizerFactory::new); } + + @Override + public Map getMappers() { + return Collections.singletonMap(ICUCollationKeywordFieldMapper.CONTENT_TYPE, new ICUCollationKeywordFieldMapper.TypeParser()); + } + + @Override + public List getNamedWriteables() { + return Collections.singletonList( + new NamedWriteableRegistry.Entry( + DocValueFormat.class, + ICUCollationKeywordFieldMapper.CollationFieldType.COLLATE_FORMAT.getWriteableName(), + in -> ICUCollationKeywordFieldMapper.CollationFieldType.COLLATE_FORMAT + ) + ); + } } diff --git a/plugins/analysis-icu/src/test/java/org/elasticsearch/index/mapper/CollationFieldTypeTests.java b/plugins/analysis-icu/src/test/java/org/elasticsearch/index/mapper/CollationFieldTypeTests.java new file mode 100644 index 00000000000..94634fc79c8 --- /dev/null +++ b/plugins/analysis-icu/src/test/java/org/elasticsearch/index/mapper/CollationFieldTypeTests.java @@ -0,0 +1,145 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.index.mapper; + +import com.carrotsearch.randomizedtesting.generators.RandomStrings; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.RawCollationKey; +import com.ibm.icu.util.ULocale; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.TermInSetQuery; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TermRangeQuery; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.unit.Fuzziness; +import org.elasticsearch.index.mapper.ICUCollationKeywordFieldMapper.CollationFieldType; +import org.elasticsearch.index.mapper.MappedFieldType.Relation; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class CollationFieldTypeTests extends FieldTypeTestCase { + @Override + protected MappedFieldType createDefaultFieldType() { + return new CollationFieldType(); + } + + public void testIsFieldWithinQuery() throws IOException { + CollationFieldType ft = new CollationFieldType(); + // current impl ignores args and should always return INTERSECTS + assertEquals(Relation.INTERSECTS, ft.isFieldWithinQuery(null, + RandomStrings.randomAsciiOfLengthBetween(random(), 0, 5), + RandomStrings.randomAsciiOfLengthBetween(random(), 0, 5), + randomBoolean(), randomBoolean(), null, null, null)); + } + + public void testTermQuery() { + MappedFieldType ft = createDefaultFieldType(); + ft.setName("field"); + ft.setIndexOptions(IndexOptions.DOCS); + + Collator
collator = Collator.getInstance(new ULocale("tr")); + collator.setStrength(Collator.PRIMARY); + collator.freeze(); + ((CollationFieldType) ft).setCollator(collator); + + RawCollationKey key = collator.getRawCollationKey("ı will use turkish casıng", null); + BytesRef expected = new BytesRef(key.bytes, 0, key.size); + + assertEquals(new TermQuery(new Term("field", expected)), ft.termQuery("I WİLL USE TURKİSH CASING", null)); + + ft.setIndexOptions(IndexOptions.NONE); + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, + () -> ft.termQuery("bar", null)); + assertEquals("Cannot search on field [field] since it is not indexed.", e.getMessage()); + } + + public void testTermsQuery() { + MappedFieldType ft = createDefaultFieldType(); + ft.setName("field"); + ft.setIndexOptions(IndexOptions.DOCS); + + Collator collator = Collator.getInstance().freeze(); + ((CollationFieldType) ft).setCollator(collator); + + RawCollationKey fooKey = collator.getRawCollationKey("foo", null); + RawCollationKey barKey = collator.getRawCollationKey("bar", null); + + List terms = new ArrayList<>(); + terms.add(new BytesRef(fooKey.bytes, 0, fooKey.size)); + terms.add(new BytesRef(barKey.bytes, 0, barKey.size)); + + assertEquals(new TermInSetQuery("field", terms), + ft.termsQuery(Arrays.asList("foo", "bar"), null)); + + ft.setIndexOptions(IndexOptions.NONE); + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, + () -> ft.termsQuery(Arrays.asList("foo", "bar"), null)); + assertEquals("Cannot search on field [field] since it is not indexed.", e.getMessage()); + } + + public void testRegexpQuery() { + MappedFieldType ft = createDefaultFieldType(); + ft.setName("field"); + ft.setIndexOptions(IndexOptions.DOCS); + expectThrows(UnsupportedOperationException.class, + () -> ft.regexpQuery("foo.*", 0, 10, null, null)); + } + + public void testFuzzyQuery() { + MappedFieldType ft = createDefaultFieldType(); + ft.setName("field"); + 
ft.setIndexOptions(IndexOptions.DOCS); + expectThrows(UnsupportedOperationException.class, + () -> ft.fuzzyQuery("foo", Fuzziness.fromEdits(2), 1, 50, true)); + } + + public void testPrefixQuery() { + MappedFieldType ft = createDefaultFieldType(); + ft.setName("field"); + ft.setIndexOptions(IndexOptions.DOCS); + expectThrows(UnsupportedOperationException.class, + () -> ft.prefixQuery("prefix", null, null)); + } + + public void testRangeQuery() { + MappedFieldType ft = createDefaultFieldType(); + ft.setName("field"); + ft.setIndexOptions(IndexOptions.DOCS); + + Collator collator = Collator.getInstance().freeze(); + ((CollationFieldType) ft).setCollator(collator); + + RawCollationKey aKey = collator.getRawCollationKey("a", null); + RawCollationKey bKey = collator.getRawCollationKey("b", null); + + TermRangeQuery expected = new TermRangeQuery("field", new BytesRef(aKey.bytes, 0, aKey.size), + new BytesRef(bKey.bytes, 0, bKey.size), false, false); + + assertEquals(expected, ft.rangeQuery("a", "b", false, false, null)); + + ft.setIndexOptions(IndexOptions.NONE); + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, + () -> ft.rangeQuery("a", "b", false, false, null)); + assertEquals("Cannot search on field [field] since it is not indexed.", e.getMessage()); + } +} diff --git a/plugins/analysis-icu/src/test/java/org/elasticsearch/index/mapper/ICUCollationKeywordFieldMapperIT.java b/plugins/analysis-icu/src/test/java/org/elasticsearch/index/mapper/ICUCollationKeywordFieldMapperIT.java new file mode 100644 index 00000000000..8a6e9b49ac9 --- /dev/null +++ b/plugins/analysis-icu/src/test/java/org/elasticsearch/index/mapper/ICUCollationKeywordFieldMapperIT.java @@ -0,0 +1,443 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. 
Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.index.mapper; + +import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNoFailures; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertOrderedSearchHits; + +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.RuleBasedCollator; +import com.ibm.icu.util.ULocale; +import org.elasticsearch.action.search.SearchRequest; +import org.elasticsearch.action.search.SearchResponse; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.common.xcontent.XContentType; +import org.elasticsearch.index.query.QueryBuilders; +import org.elasticsearch.plugin.analysis.icu.AnalysisICUPlugin; +import org.elasticsearch.plugins.Plugin; +import org.elasticsearch.search.builder.SearchSourceBuilder; +import org.elasticsearch.search.sort.SortOrder; +import org.elasticsearch.test.ESIntegTestCase; + +import java.util.Collection; +import java.util.Collections; + +public class ICUCollationKeywordFieldMapperIT extends ESIntegTestCase { + + @Override + protected Collection> nodePlugins() { + return Collections.singletonList(AnalysisICUPlugin.class); + } + + 
/* + * Turkish has some funny casing. + * This test shows how you can solve this kind of thing easily with collation. + * Instead of using LowerCaseFilter, use a turkish collator with primary strength. + * Then things will sort and match correctly. + */ + public void testBasicUsage() throws Exception { + String index = "foo"; + String type = "mytype"; + + String[] equilavent = {"I WİLL USE TURKİSH CASING", "ı will use turkish casıng"}; + + XContentBuilder builder = jsonBuilder() + .startObject().startObject("properties") + .startObject("collate") + .field("type", "icu_collation_keyword") + .field("language", "tr") + .field("strength", "primary") + .endObject() + .endObject().endObject(); + + assertAcked(client().admin().indices().prepareCreate(index).addMapping(type, builder)); + + // both values should collate to same value + indexRandom(true, + client().prepareIndex(index, type, "1").setSource("{\"collate\":\"" + equilavent[0] + "\"}", XContentType.JSON), + client().prepareIndex(index, type, "2").setSource("{\"collate\":\"" + equilavent[1] + "\"}", XContentType.JSON) + ); + + // searching for either of the terms should return both results since they collate to the same value + SearchRequest request = new SearchRequest() + .indices(index) + .types(type) + .source(new SearchSourceBuilder() + .fetchSource(false) + .query(QueryBuilders.termQuery("collate", randomBoolean() ? equilavent[0] : equilavent[1])) + .sort("collate") + .sort("_uid", SortOrder.DESC) // secondary sort should kick in because both will collate to same value + ); + + SearchResponse response = client().search(request).actionGet(); + assertNoFailures(response); + assertHitCount(response, 2L); + assertOrderedSearchHits(response, "2", "1"); + } + + /* + * Test usage of the decomposition option for unicode normalization. 
+ */ + public void testNormalization() throws Exception { + String index = "foo"; + String type = "mytype"; + + String[] equilavent = {"I W\u0049\u0307LL USE TURKİSH CASING", "ı will use turkish casıng"}; + + XContentBuilder builder = jsonBuilder() + .startObject().startObject("properties") + .startObject("collate") + .field("type", "icu_collation_keyword") + .field("language", "tr") + .field("strength", "primary") + .field("decomposition", "canonical") + .endObject() + .endObject().endObject(); + + assertAcked(client().admin().indices().prepareCreate(index).addMapping(type, builder)); + + indexRandom(true, + client().prepareIndex(index, type, "1").setSource("{\"collate\":\"" + equilavent[0] + "\"}", XContentType.JSON), + client().prepareIndex(index, type, "2").setSource("{\"collate\":\"" + equilavent[1] + "\"}", XContentType.JSON) + ); + + // searching for either of the terms should return both results since they collate to the same value + SearchRequest request = new SearchRequest() + .indices(index) + .types(type) + .source(new SearchSourceBuilder() + .fetchSource(false) + .query(QueryBuilders.termQuery("collate", randomBoolean() ? equilavent[0] : equilavent[1])) + .sort("collate") + .sort("_uid", SortOrder.DESC) // secondary sort should kick in because both will collate to same value + ); + + SearchResponse response = client().search(request).actionGet(); + assertNoFailures(response); + assertHitCount(response, 2L); + assertOrderedSearchHits(response, "2", "1"); + } + + /* + * Test secondary strength, for english case is not significant. 
+ */ + public void testSecondaryStrength() throws Exception { + String index = "foo"; + String type = "mytype"; + + String[] equilavent = {"TESTING", "testing"}; + + XContentBuilder builder = jsonBuilder() + .startObject().startObject("properties") + .startObject("collate") + .field("type", "icu_collation_keyword") + .field("language", "en") + .field("strength", "secondary") + .field("decomposition", "no") + .endObject() + .endObject().endObject(); + + assertAcked(client().admin().indices().prepareCreate(index).addMapping(type, builder)); + + indexRandom(true, + client().prepareIndex(index, type, "1").setSource("{\"collate\":\"" + equilavent[0] + "\"}", XContentType.JSON), + client().prepareIndex(index, type, "2").setSource("{\"collate\":\"" + equilavent[1] + "\"}", XContentType.JSON) + ); + + SearchRequest request = new SearchRequest() + .indices(index) + .types(type) + .source(new SearchSourceBuilder() + .fetchSource(false) + .query(QueryBuilders.termQuery("collate", randomBoolean() ? 
equilavent[0] : equilavent[1])) + .sort("collate") + .sort("_uid", SortOrder.DESC) // secondary sort should kick in because both will collate to same value + ); + + SearchResponse response = client().search(request).actionGet(); + assertNoFailures(response); + assertHitCount(response, 2L); + assertOrderedSearchHits(response, "2", "1"); + } + + /* + * Setting alternate=shifted to shift whitespace, punctuation and symbols + * to quaternary level + */ + public void testIgnorePunctuation() throws Exception { + String index = "foo"; + String type = "mytype"; + + String[] equilavent = {"foo-bar", "foo bar"}; + + XContentBuilder builder = jsonBuilder() + .startObject().startObject("properties") + .startObject("collate") + .field("type", "icu_collation_keyword") + .field("language", "en") + .field("strength", "primary") + .field("alternate", "shifted") + .endObject() + .endObject().endObject(); + + assertAcked(client().admin().indices().prepareCreate(index).addMapping(type, builder)); + + indexRandom(true, + client().prepareIndex(index, type, "1").setSource("{\"collate\":\"" + equilavent[0] + "\"}", XContentType.JSON), + client().prepareIndex(index, type, "2").setSource("{\"collate\":\"" + equilavent[1] + "\"}", XContentType.JSON) + ); + + SearchRequest request = new SearchRequest() + .indices(index) + .types(type) + .source(new SearchSourceBuilder() + .fetchSource(false) + .query(QueryBuilders.termQuery("collate", randomBoolean() ? 
equilavent[0] : equilavent[1])) + .sort("collate") + .sort("_uid", SortOrder.DESC) // secondary sort should kick in because both will collate to same value + ); + + SearchResponse response = client().search(request).actionGet(); + assertNoFailures(response); + assertHitCount(response, 2L); + assertOrderedSearchHits(response, "2", "1"); + } + + /* + * Setting alternate=shifted and variableTop to shift whitespace, but not + * punctuation or symbols, to quaternary level + */ + public void testIgnoreWhitespace() throws Exception { + String index = "foo"; + String type = "mytype"; + + XContentBuilder builder = jsonBuilder() + .startObject().startObject("properties") + .startObject("collate") + .field("type", "icu_collation_keyword") + .field("language", "en") + .field("strength", "primary") + .field("alternate", "shifted") + .field("variable_top", " ") + .field("index", false) + .endObject() + .endObject().endObject(); + + assertAcked(client().admin().indices().prepareCreate(index).addMapping(type, builder)); + + indexRandom(true, + client().prepareIndex(index, type, "1").setSource("{\"collate\":\"foo bar\"}", XContentType.JSON), + client().prepareIndex(index, type, "2").setSource("{\"collate\":\"foobar\"}", XContentType.JSON), + client().prepareIndex(index, type, "3").setSource("{\"collate\":\"foo-bar\"}", XContentType.JSON) + ); + + SearchRequest request = new SearchRequest() + .indices(index) + .types(type) + .source(new SearchSourceBuilder() + .fetchSource(false) + .sort("collate", SortOrder.ASC) + .sort("_uid", SortOrder.ASC) // secondary sort should kick in on docs 1 and 2 because they have the same collate value + ); + + SearchResponse response = client().search(request).actionGet(); + assertNoFailures(response); + assertHitCount(response, 3L); + assertOrderedSearchHits(response, "3", "1", "2"); + } + + /* + * Setting numeric to encode digits with numeric value, so that + * foobar-9 sorts before foobar-10 + */ + public void testNumerics() throws Exception { + String
index = "foo"; + String type = "mytype"; + + XContentBuilder builder = jsonBuilder() + .startObject().startObject("properties") + .startObject("collate") + .field("type", "icu_collation_keyword") + .field("language", "en") + .field("numeric", true) + .field("index", false) + .endObject() + .endObject().endObject(); + + assertAcked(client().admin().indices().prepareCreate(index).addMapping(type, builder)); + + indexRandom(true, + client().prepareIndex(index, type, "1").setSource("{\"collate\":\"foobar-10\"}", XContentType.JSON), + client().prepareIndex(index, type, "2").setSource("{\"collate\":\"foobar-9\"}", XContentType.JSON) + ); + + SearchRequest request = new SearchRequest() + .indices(index) + .types(type) + .source(new SearchSourceBuilder() + .fetchSource(false) + .sort("collate", SortOrder.ASC) + ); + + SearchResponse response = client().search(request).actionGet(); + assertNoFailures(response); + assertHitCount(response, 2L); + assertOrderedSearchHits(response, "2", "1"); + } + + /* + * Setting caseLevel=true to create an additional case level between + * secondary and tertiary + */ + public void testIgnoreAccentsButNotCase() throws Exception { + String index = "foo"; + String type = "mytype"; + + XContentBuilder builder = jsonBuilder() + .startObject().startObject("properties") + .startObject("collate") + .field("type", "icu_collation_keyword") + .field("language", "en") + .field("strength", "primary") + .field("case_level", true) + .field("index", false) + .endObject() + .endObject().endObject(); + + assertAcked(client().admin().indices().prepareCreate(index).addMapping(type, builder)); + + indexRandom(true, + client().prepareIndex(index, type, "1").setSource("{\"collate\":\"résumé\"}", XContentType.JSON), + client().prepareIndex(index, type, "2").setSource("{\"collate\":\"Resume\"}", XContentType.JSON), + client().prepareIndex(index, type, "3").setSource("{\"collate\":\"resume\"}", XContentType.JSON), + client().prepareIndex(index, type, 
"4").setSource("{\"collate\":\"Résumé\"}", XContentType.JSON) + ); + + SearchRequest request = new SearchRequest() + .indices(index) + .types(type) + .source(new SearchSourceBuilder() + .fetchSource(false) + .sort("collate", SortOrder.ASC) + .sort("_uid", SortOrder.DESC) + ); + + SearchResponse response = client().search(request).actionGet(); + assertNoFailures(response); + assertHitCount(response, 4L); + assertOrderedSearchHits(response, "3", "1", "4", "2"); + } + + /* + * Setting caseFirst=upper to cause uppercase strings to sort + * before lowercase ones. + */ + public void testUpperCaseFirst() throws Exception { + String index = "foo"; + String type = "mytype"; + + XContentBuilder builder = jsonBuilder() + .startObject().startObject("properties") + .startObject("collate") + .field("type", "icu_collation_keyword") + .field("language", "en") + .field("strength", "tertiary") + .field("case_first", "upper") + .field("index", false) + .endObject() + .endObject().endObject(); + + assertAcked(client().admin().indices().prepareCreate(index).addMapping(type, builder)); + + indexRandom(true, + client().prepareIndex(index, type, "1").setSource("{\"collate\":\"resume\"}", XContentType.JSON), + client().prepareIndex(index, type, "2").setSource("{\"collate\":\"Resume\"}", XContentType.JSON) + ); + + SearchRequest request = new SearchRequest() + .indices(index) + .types(type) + .source(new SearchSourceBuilder() + .fetchSource(false) + .sort("collate", SortOrder.ASC) + ); + + SearchResponse response = client().search(request).actionGet(); + assertNoFailures(response); + assertHitCount(response, 2L); + assertOrderedSearchHits(response, "2", "1"); + } + + /* + * For german, you might want oe to sort and match with o umlaut. + * This is not the default, but you can make a customized ruleset to do this. + * + * The default is DIN 5007-1, this shows how to tailor a collator to get DIN 5007-2 behavior. 
+ * http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383 + */ + public void testCustomRules() throws Exception { + String index = "foo"; + String type = "mytype"; + + RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE")); + String DIN5007_2_tailorings = + "& ae , a\u0308 & AE , A\u0308" + + "& oe , o\u0308 & OE , O\u0308" + + "& ue , u\u0308 & UE , u\u0308"; + + RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings); + String tailoredRules = tailoredCollator.getRules(); + + String[] equilavent = {"Töne", "Toene"}; + + XContentBuilder builder = jsonBuilder() + .startObject().startObject("properties") + .startObject("collate") + .field("type", "icu_collation_keyword") + .field("rules", tailoredRules) + .field("strength", "primary") + .endObject() + .endObject().endObject(); + + assertAcked(client().admin().indices().prepareCreate(index).addMapping(type, builder)); + + indexRandom(true, + client().prepareIndex(index, type, "1").setSource("{\"collate\":\"" + equilavent[0] + "\"}", XContentType.JSON), + client().prepareIndex(index, type, "2").setSource("{\"collate\":\"" + equilavent[1] + "\"}", XContentType.JSON) + ); + + SearchRequest request = new SearchRequest() + .indices(index) + .types(type) + .source(new SearchSourceBuilder() + .fetchSource(false) + .query(QueryBuilders.termQuery("collate", randomBoolean() ? 
equilavent[0] : equilavent[1])) + .sort("collate", SortOrder.ASC) + .sort("_uid", SortOrder.DESC) // secondary sort should kick in because both will collate to same value + ); + + SearchResponse response = client().search(request).actionGet(); + assertNoFailures(response); + assertHitCount(response, 2L); + assertOrderedSearchHits(response, "2", "1"); + } +} diff --git a/plugins/analysis-icu/src/test/java/org/elasticsearch/index/mapper/ICUCollationKeywordFieldMapperTests.java b/plugins/analysis-icu/src/test/java/org/elasticsearch/index/mapper/ICUCollationKeywordFieldMapperTests.java new file mode 100644 index 00000000000..ebe909837e9 --- /dev/null +++ b/plugins/analysis-icu/src/test/java/org/elasticsearch/index/mapper/ICUCollationKeywordFieldMapperTests.java @@ -0,0 +1,342 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.index.mapper; + +import static org.hamcrest.Matchers.equalTo; + +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.RawCollationKey; +import com.ibm.icu.util.ULocale; +import org.apache.lucene.index.DocValuesType; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.index.IndexableFieldType; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.compress.CompressedXContent; +import org.elasticsearch.common.xcontent.XContentFactory; +import org.elasticsearch.common.xcontent.XContentType; +import org.elasticsearch.index.IndexService; +import org.elasticsearch.index.mapper.MapperService.MergeReason; +import org.elasticsearch.plugin.analysis.icu.AnalysisICUPlugin; +import org.elasticsearch.plugins.Plugin; +import org.elasticsearch.test.ESSingleNodeTestCase; +import org.elasticsearch.test.InternalSettingsPlugin; +import org.junit.Before; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collection; + +public class ICUCollationKeywordFieldMapperTests extends ESSingleNodeTestCase { + + private static final String FIELD_TYPE = "icu_collation_keyword"; + + @Override + protected Collection> getPlugins() { + return Arrays.asList(AnalysisICUPlugin.class, InternalSettingsPlugin.class); + } + + IndexService indexService; + DocumentMapperParser parser; + + @Before + public void setup() { + indexService = createIndex("test"); + parser = indexService.mapperService().documentMapperParser(); + } + + public void testDefaults() throws Exception { + String mapping = XContentFactory.jsonBuilder().startObject().startObject("type") + .startObject("properties").startObject("field").field("type", FIELD_TYPE).endObject().endObject() + .endObject().endObject().string(); + + DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping)); + + assertEquals(mapping, mapper.mappingSource().toString()); + + ParsedDocument doc = 
mapper.parse(SourceToParse.source("test", "type", "1", XContentFactory.jsonBuilder() + .startObject() + .field("field", "1234") + .endObject() + .bytes(), + XContentType.JSON)); + + IndexableField[] fields = doc.rootDoc().getFields("field"); + assertEquals(2, fields.length); + + Collator collator = Collator.getInstance(); + RawCollationKey key = collator.getRawCollationKey("1234", null); + BytesRef expected = new BytesRef(key.bytes, 0, key.size); + + assertEquals(expected, fields[0].binaryValue()); + IndexableFieldType fieldType = fields[0].fieldType(); + assertThat(fieldType.omitNorms(), equalTo(true)); + assertFalse(fieldType.tokenized()); + assertFalse(fieldType.stored()); + assertThat(fieldType.indexOptions(), equalTo(IndexOptions.DOCS)); + assertThat(fieldType.storeTermVectors(), equalTo(false)); + assertThat(fieldType.storeTermVectorOffsets(), equalTo(false)); + assertThat(fieldType.storeTermVectorPositions(), equalTo(false)); + assertThat(fieldType.storeTermVectorPayloads(), equalTo(false)); + assertEquals(DocValuesType.NONE, fieldType.docValuesType()); + + assertEquals(expected, fields[1].binaryValue()); + fieldType = fields[1].fieldType(); + assertThat(fieldType.indexOptions(), equalTo(IndexOptions.NONE)); + assertEquals(DocValuesType.SORTED, fieldType.docValuesType()); + } + + public void testNullValue() throws IOException { + String mapping = XContentFactory.jsonBuilder().startObject().startObject("type") + .startObject("properties").startObject("field").field("type", FIELD_TYPE).endObject().endObject() + .endObject().endObject().string(); + + DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping)); + assertEquals(mapping, mapper.mappingSource().toString()); + + ParsedDocument doc = mapper.parse(SourceToParse.source("test", "type", "1", XContentFactory.jsonBuilder() + .startObject() + .nullField("field") + .endObject() + .bytes(), + XContentType.JSON)); + assertArrayEquals(new IndexableField[0], doc.rootDoc().getFields("field")); + 
+ mapping = XContentFactory.jsonBuilder().startObject().startObject("type") + .startObject("properties").startObject("field").field("type", FIELD_TYPE) + .field("null_value", "1234").endObject().endObject() + .endObject().endObject().string(); + + mapper = parser.parse("type", new CompressedXContent(mapping)); + + assertEquals(mapping, mapper.mappingSource().toString()); + + doc = mapper.parse(SourceToParse.source("test", "type", "1", XContentFactory.jsonBuilder() + .startObject() + .endObject() + .bytes(), + XContentType.JSON)); + + IndexableField[] fields = doc.rootDoc().getFields("field"); + assertEquals(0, fields.length); + + doc = mapper.parse(SourceToParse.source("test", "type", "1", XContentFactory.jsonBuilder() + .startObject() + .nullField("field") + .endObject() + .bytes(), + XContentType.JSON)); + + Collator collator = Collator.getInstance(); + RawCollationKey key = collator.getRawCollationKey("1234", null); + BytesRef expected = new BytesRef(key.bytes, 0, key.size); + + fields = doc.rootDoc().getFields("field"); + assertEquals(2, fields.length); + assertEquals(expected, fields[0].binaryValue()); + } + + public void testEnableStore() throws IOException { + String mapping = XContentFactory.jsonBuilder().startObject().startObject("type") + .startObject("properties").startObject("field").field("type", FIELD_TYPE) + .field("store", true).endObject().endObject() + .endObject().endObject().string(); + + DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping)); + + assertEquals(mapping, mapper.mappingSource().toString()); + + ParsedDocument doc = mapper.parse(SourceToParse.source("test", "type", "1", XContentFactory.jsonBuilder() + .startObject() + .field("field", "1234") + .endObject() + .bytes(), + XContentType.JSON)); + + IndexableField[] fields = doc.rootDoc().getFields("field"); + assertEquals(2, fields.length); + assertTrue(fields[0].fieldType().stored()); + } + + public void testDisableIndex() throws IOException { + String mapping = 
XContentFactory.jsonBuilder().startObject().startObject("type") + .startObject("properties").startObject("field").field("type", FIELD_TYPE) + .field("index", false).endObject().endObject() + .endObject().endObject().string(); + + DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping)); + + assertEquals(mapping, mapper.mappingSource().toString()); + + ParsedDocument doc = mapper.parse(SourceToParse.source("test", "type", "1", XContentFactory.jsonBuilder() + .startObject() + .field("field", "1234") + .endObject() + .bytes(), + XContentType.JSON)); + + IndexableField[] fields = doc.rootDoc().getFields("field"); + assertEquals(1, fields.length); + assertEquals(IndexOptions.NONE, fields[0].fieldType().indexOptions()); + assertEquals(DocValuesType.SORTED, fields[0].fieldType().docValuesType()); + } + + public void testDisableDocValues() throws IOException { + String mapping = XContentFactory.jsonBuilder().startObject().startObject("type") + .startObject("properties").startObject("field").field("type", FIELD_TYPE) + .field("doc_values", false).endObject().endObject() + .endObject().endObject().string(); + + DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping)); + + assertEquals(mapping, mapper.mappingSource().toString()); + + ParsedDocument doc = mapper.parse(SourceToParse.source("test", "type", "1", XContentFactory.jsonBuilder() + .startObject() + .field("field", "1234") + .endObject() + .bytes(), + XContentType.JSON)); + + IndexableField[] fields = doc.rootDoc().getFields("field"); + assertEquals(1, fields.length); + assertEquals(DocValuesType.NONE, fields[0].fieldType().docValuesType()); + } + + public void testIndexOptions() throws IOException { + String mapping = XContentFactory.jsonBuilder().startObject().startObject("type") + .startObject("properties").startObject("field").field("type", FIELD_TYPE) + .field("index_options", "freqs").endObject().endObject() + .endObject().endObject().string(); + + DocumentMapper mapper = 
parser.parse("type", new CompressedXContent(mapping)); + + assertEquals(mapping, mapper.mappingSource().toString()); + + ParsedDocument doc = mapper.parse(SourceToParse.source("test", "type", "1", XContentFactory.jsonBuilder() + .startObject() + .field("field", "1234") + .endObject() + .bytes(), + XContentType.JSON)); + + IndexableField[] fields = doc.rootDoc().getFields("field"); + assertEquals(2, fields.length); + assertEquals(IndexOptions.DOCS_AND_FREQS, fields[0].fieldType().indexOptions()); + + for (String indexOptions : Arrays.asList("positions", "offsets")) { + final String mapping2 = XContentFactory.jsonBuilder().startObject().startObject("type") + .startObject("properties").startObject("field").field("type", FIELD_TYPE) + .field("index_options", indexOptions).endObject().endObject() + .endObject().endObject().string(); + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, + () -> parser.parse("type", new CompressedXContent(mapping2))); + assertEquals("The [" + FIELD_TYPE + "] field does not support positions, got [index_options]=" + indexOptions, + e.getMessage()); + } + } + + public void testEnableNorms() throws IOException { + String mapping = XContentFactory.jsonBuilder().startObject().startObject("type") + .startObject("properties").startObject("field").field("type", FIELD_TYPE) + .field("norms", true).endObject().endObject() + .endObject().endObject().string(); + + DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping)); + + assertEquals(mapping, mapper.mappingSource().toString()); + + ParsedDocument doc = mapper.parse(SourceToParse.source("test", "type", "1", XContentFactory.jsonBuilder() + .startObject() + .field("field", "1234") + .endObject() + .bytes(), + XContentType.JSON)); + + IndexableField[] fields = doc.rootDoc().getFields("field"); + assertEquals(2, fields.length); + assertFalse(fields[0].fieldType().omitNorms()); + } + + public void testCollator() throws IOException { + String mapping = 
XContentFactory.jsonBuilder().startObject().startObject("type") + .startObject("properties").startObject("field") + .field("type", FIELD_TYPE) + .field("language", "tr") + .field("strength", "primary") + .endObject().endObject().endObject().endObject().string(); + + DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping)); + + assertEquals(mapping, mapper.mappingSource().toString()); + + ParsedDocument doc = mapper.parse(SourceToParse.source("test", "type", "1", XContentFactory.jsonBuilder() + .startObject() + .field("field", "I WİLL USE TURKİSH CASING") + .endObject() + .bytes(), + XContentType.JSON)); + + /* build the expected key independently of the mapper: a tr-locale Collator at PRIMARY strength */ Collator collator = Collator.getInstance(new ULocale("tr")); + collator.setStrength(Collator.PRIMARY); + RawCollationKey key = collator.getRawCollationKey("ı will use turkish casıng", null); // should collate to same value + BytesRef expected = new BytesRef(key.bytes, 0, key.size); + + IndexableField[] fields = doc.rootDoc().getFields("field"); + assertEquals(2, fields.length); + + /* first instance: the indexed key (DOCS only, norms omitted, not tokenized, not stored, no term vectors, no doc values) */ assertEquals(expected, fields[0].binaryValue()); + IndexableFieldType fieldType = fields[0].fieldType(); + assertThat(fieldType.omitNorms(), equalTo(true)); + assertFalse(fieldType.tokenized()); + assertFalse(fieldType.stored()); + assertThat(fieldType.indexOptions(), equalTo(IndexOptions.DOCS)); + assertThat(fieldType.storeTermVectors(), equalTo(false)); + assertThat(fieldType.storeTermVectorOffsets(), equalTo(false)); + assertThat(fieldType.storeTermVectorPositions(), equalTo(false)); + assertThat(fieldType.storeTermVectorPayloads(), equalTo(false)); + assertEquals(DocValuesType.NONE, fieldType.docValuesType()); + + /* second instance: the same key, not indexed, with SORTED doc values */ assertEquals(expected, fields[1].binaryValue()); + fieldType = fields[1].fieldType(); + assertThat(fieldType.indexOptions(), equalTo(IndexOptions.NONE)); + assertEquals(DocValuesType.SORTED, fieldType.docValuesType()); + } + + /* Merges a mapping with language tr and strength primary, then expects a second merge that switches the language to en and leaves strength out to fail with an IllegalArgumentException naming both settings as conflicts. */ public void testUpdateCollator() throws IOException { + String mapping =
XContentFactory.jsonBuilder().startObject().startObject("type") + .startObject("properties").startObject("field") + .field("type", FIELD_TYPE) + .field("language", "tr") + .field("strength", "primary") + .endObject().endObject().endObject().endObject().string(); + indexService.mapperService().merge("type", new CompressedXContent(mapping), MergeReason.MAPPING_UPDATE, randomBoolean()); + + /* same field, but language en and no explicit strength: merging this update is expected to be rejected */ String mapping2 = XContentFactory.jsonBuilder().startObject().startObject("type") + .startObject("properties").startObject("field") + .field("type", FIELD_TYPE) + .field("language", "en") + .endObject().endObject().endObject().endObject().string(); + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, + () -> indexService.mapperService().merge("type", + new CompressedXContent(mapping2), MergeReason.MAPPING_UPDATE, randomBoolean())); + /* both changed collator settings must be listed in the conflict message */ assertEquals("Can't merge because of conflicts: [Cannot update language setting for [" + FIELD_TYPE + + "], Cannot update strength setting for [" + FIELD_TYPE + "]]", e.getMessage()); + } +} diff --git a/core/src/test/java/org/elasticsearch/index/mapper/FieldTypeTestCase.java b/test/framework/src/main/java/org/elasticsearch/index/mapper/FieldTypeTestCase.java similarity index 99% rename from core/src/test/java/org/elasticsearch/index/mapper/FieldTypeTestCase.java rename to test/framework/src/main/java/org/elasticsearch/index/mapper/FieldTypeTestCase.java index 5d86602c4ca..ae91a791535 100644 --- a/core/src/test/java/org/elasticsearch/index/mapper/FieldTypeTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/index/mapper/FieldTypeTestCase.java @@ -175,13 +175,15 @@ public abstract class FieldTypeTestCase extends ESTestCase { // TODO: remove this once toString is no longer final on FieldType...
protected void assertFieldTypeEquals(String property, MappedFieldType ft1, MappedFieldType ft2) { if (ft1.equals(ft2) == false) { - fail("Expected equality, testing property " + property + "\nexpected: " + toString(ft1) + "; \nactual: " + toString(ft2) + "\n"); + /* report the property under test together with both field types */ fail("Expected equality, testing property " + property + "\nexpected: " + toString(ft1) + "; \nactual: " + toString(ft2) + + "\n"); } } protected void assertFieldTypeNotEquals(String property, MappedFieldType ft1, MappedFieldType ft2) { if (ft1.equals(ft2)) { - fail("Expected inequality, testing property " + property + "\nfirst: " + toString(ft1) + "; \nsecond: " + toString(ft2) + "\n"); + /* report the property under test together with both field types */ fail("Expected inequality, testing property " + property + "\nfirst: " + toString(ft1) + "; \nsecond: " + toString(ft2) + + "\n"); } }