Add ICUCollationFieldMapper (#24126)

Adds a new "icu_collation_keyword" field type that exposes Lucene's
ICUCollationDocValuesField. ICUCollationDocValuesField is the replacement
for ICUCollationKeyFilter, which has been deprecated since Lucene 5.
This commit is contained in:
Matt Weber 2017-05-10 01:35:11 -07:00 committed by Adrien Grand
parent 3f1ef488cd
commit b24326271e
9 changed files with 1774 additions and 45 deletions

View File

@ -57,7 +57,7 @@ public abstract class StringFieldType extends TermBasedFieldType {
} }
@Override @Override
public final Query fuzzyQuery(Object value, Fuzziness fuzziness, int prefixLength, int maxExpansions, public Query fuzzyQuery(Object value, Fuzziness fuzziness, int prefixLength, int maxExpansions,
boolean transpositions) { boolean transpositions) {
failIfNotIndexed(); failIfNotIndexed();
return new FuzzyQuery(new Term(name(), indexedValueForSearch(value)), return new FuzzyQuery(new Term(name(), indexedValueForSearch(value)),
@ -65,7 +65,7 @@ public abstract class StringFieldType extends TermBasedFieldType {
} }
@Override @Override
public final Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) { public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
failIfNotIndexed(); failIfNotIndexed();
PrefixQuery query = new PrefixQuery(new Term(name(), indexedValueForSearch(value))); PrefixQuery query = new PrefixQuery(new Term(name(), indexedValueForSearch(value)));
if (method != null) { if (method != null) {
@ -75,7 +75,7 @@ public abstract class StringFieldType extends TermBasedFieldType {
} }
@Override @Override
public final Query regexpQuery(String value, int flags, int maxDeterminizedStates, public Query regexpQuery(String value, int flags, int maxDeterminizedStates,
MultiTermQuery.RewriteMethod method, QueryShardContext context) { MultiTermQuery.RewriteMethod method, QueryShardContext context) {
failIfNotIndexed(); failIfNotIndexed();
RegexpQuery query = new RegexpQuery(new Term(name(), indexedValueForSearch(value)), flags, maxDeterminizedStates); RegexpQuery query = new RegexpQuery(new Term(name(), indexedValueForSearch(value)), flags, maxDeterminizedStates);

View File

@ -302,50 +302,46 @@ PUT icu_sample
-------------------------------------------------- --------------------------------------------------
// CONSOLE // CONSOLE
[[analysis-icu-collation]] [[analysis-icu-collation]]
==== ICU Collation Token Filter ==== ICU Collation Token Filter
[WARNING]
======
This token filter has been deprecated since Lucene 5.0. Please use
<<analysis-icu-collation-keyword-field, ICU Collation Keyword Field>>.
======
[[analysis-icu-collation-keyword-field]]
==== ICU Collation Keyword Field
Collations are used for sorting documents in a language-specific word order. Collations are used for sorting documents in a language-specific word order.
The `icu_collation` token filter is available to all indices and defaults to The `icu_collation_keyword` field type is available to all indices and will encode
using the the terms directly as bytes in a doc values field and a single indexed token just
{defguide}/sorting-collations.html#uca[DUCET collation], like a standard {ref}/keyword.html[Keyword Field].
Defaults to using {defguide}/sorting-collations.html#uca[DUCET collation],
which is a best-effort attempt at language-neutral sorting. which is a best-effort attempt at language-neutral sorting.
Below is an example of how to set up a field for sorting German names in Below is an example of how to set up a field for sorting German names in
``phonebook'' order: ``phonebook'' order:
[source,js] [source,js]
-------------------------------------------------- --------------------------
PUT /my_index PUT my_index
{ {
"settings": {
"analysis": {
"filter": {
"german_phonebook": {
"type": "icu_collation",
"language": "de",
"country": "DE",
"variant": "@collation=phonebook"
}
},
"analyzer": {
"german_phonebook": {
"tokenizer": "keyword",
"filter": [ "german_phonebook" ]
}
}
}
},
"mappings": { "mappings": {
"user": { "user": {
"properties": { "properties": {
"name": { <1> "name": { <1>
"type": "text", "type": "text",
"fields": { "fields": {
"sort": { <2> "sort": { <2>
"type": "text", "type": "icu_collation_keyword",
"fielddata": true, "index": false,
"analyzer": "german_phonebook" "language": "de",
"country": "DE",
"variant": "@collation=phonebook"
} }
} }
} }
@ -364,15 +360,47 @@ GET _search <3>
"sort": "name.sort" "sort": "name.sort"
} }
-------------------------------------------------- --------------------------
// CONSOLE // CONSOLE
<1> The `name` field uses the `standard` analyzer, and so support full text queries. <1> The `name` field uses the `standard` analyzer, and so support full text queries.
<2> The `name.sort` field uses the `keyword` analyzer to preserve the name as <2> The `name.sort` field is an `icu_collation_keyword` field that will preserve the name as
a single token, and applies the `german_phonebook` token filter to index a single token doc values, and applies the German ``phonebook'' order.
the value in German phonebook sort order.
<3> An example query which searches the `name` field and sorts on the `name.sort` field. <3> An example query which searches the `name` field and sorts on the `name.sort` field.
==== Parameters for ICU Collation Keyword Fields
The following parameters are accepted by `icu_collation_keyword` fields:
[horizontal]
`doc_values`::
Should the field be stored on disk in a column-stride fashion, so that it
can later be used for sorting, aggregations, or scripting? Accepts `true`
(default) or `false`.
`index`::
Should the field be searchable? Accepts `true` (default) or `false`.
`null_value`::
Accepts a string value which is substituted for any explicit `null`
values. Defaults to `null`, which means the field is treated as missing.
`store`::
Whether the field value should be stored and retrievable separately from
the {ref}/mapping-source-field.html[`_source`] field. Accepts `true` or `false`
(default).
`fields`::
Multi-fields allow the same string value to be indexed in multiple ways for
different purposes, such as one field for search and a multi-field for
sorting and aggregations.
===== Collation options ===== Collation options
`strength`:: `strength`::
@ -404,14 +432,14 @@ Possible values: `shifted` or `non-ignorable`. Sets the alternate handling for
strength `quaternary` to be either shifted or non-ignorable. Which boils down strength `quaternary` to be either shifted or non-ignorable. Which boils down
to ignoring punctuation and whitespace. to ignoring punctuation and whitespace.
`caseLevel`:: `case_level`::
Possible values: `true` or `false` (default). Whether case level sorting is Possible values: `true` or `false` (default). Whether case level sorting is
required. When strength is set to `primary` this will ignore accent required. When strength is set to `primary` this will ignore accent
differences. differences.
`caseFirst`:: `case_first`::
Possible values: `lower` or `upper`. Useful to control which case is sorted Possible values: `lower` or `upper`. Useful to control which case is sorted
first when case is not ignored for strength `tertiary`. The default depends on first when case is not ignored for strength `tertiary`. The default depends on
@ -424,11 +452,11 @@ according to their numeric representation. For example the value `egg-9` is
sorted before the value `egg-21`. sorted before the value `egg-21`.
`variableTop`:: `variable_top`::
Single character or contraction. Controls what is variable for `alternate`. Single character or contraction. Controls what is variable for `alternate`.
`hiraganaQuaternaryMode`:: `hiragana_quaternary_mode`::
Possible values: `true` or `false`. Distinguishing between Katakana and Possible values: `true` or `false`. Distinguishing between Katakana and
Hiragana characters in `quaternary` strength. Hiragana characters in `quaternary` strength.

View File

@ -0,0 +1,746 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.mapper;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RawCollationKey;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.Fuzziness;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.common.xcontent.support.XContentMapValues;
import org.elasticsearch.index.analysis.IndexableBinaryStringTools;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.plain.DocValuesIndexFieldData;
import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.search.DocValueFormat;
import org.joda.time.DateTimeZone;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.function.LongSupplier;
public class ICUCollationKeywordFieldMapper extends FieldMapper {
public static final String CONTENT_TYPE = "icu_collation_keyword";
/**
 * Default settings for {@code icu_collation_keyword} fields.
 */
public static class Defaults {
    /** Value substituted for explicit nulls when the mapping does not configure one. */
    public static final String NULL_VALUE = null;

    /** Frozen field type: not tokenized, norms omitted, indexed with {@link IndexOptions#DOCS}. */
    public static final MappedFieldType FIELD_TYPE = createFieldType();

    /** Configures and freezes the shared default field type. */
    private static MappedFieldType createFieldType() {
        final CollationFieldType type = new CollationFieldType();
        type.setTokenized(false);
        type.setOmitNorms(true);
        type.setIndexOptions(IndexOptions.DOCS);
        type.freeze();
        return type;
    }
}
/**
 * Field type for {@code icu_collation_keyword} fields.
 * <p>
 * Search values are converted to the raw ICU collation key of the configured
 * {@link Collator} (see {@link #indexedValueForSearch(Object)}). Both the index
 * and the search analyzer are the keyword analyzer.
 */
public static final class CollationFieldType extends StringFieldType {
    /** Collator used to compute collation keys; {@code null} until {@link #setCollator(Collator)} is called. */
    private Collator collator = null;

    public CollationFieldType() {
        setIndexAnalyzer(Lucene.KEYWORD_ANALYZER);
        setSearchAnalyzer(Lucene.KEYWORD_ANALYZER);
    }

    /** Copy constructor used by {@link #clone()}; the collator reference is shared, not copied. */
    protected CollationFieldType(CollationFieldType ref) {
        super(ref);
        this.collator = ref.collator;
    }

    @Override
    public CollationFieldType clone() {
        return new CollationFieldType(this);
    }

    @Override
    public boolean equals(Object o) {
        // NOTE(review): the cast relies on super.equals rejecting null and other classes - confirm.
        return super.equals(o) && Objects.equals(collator, ((CollationFieldType) o).collator);
    }

    @Override
    public void checkCompatibility(MappedFieldType otherFT, List<String> conflicts, boolean strict) {
        super.checkCompatibility(otherFT, conflicts, strict);
        // NOTE(review): assumes the super call has already verified that otherFT is the same type - confirm.
        CollationFieldType other = (CollationFieldType) otherFT;
        if (!Objects.equals(collator, other.collator)) {
            conflicts.add("mapper [" + name() + "] has different [collator]");
        }
    }

    @Override
    public int hashCode() {
        return 31 * super.hashCode() + Objects.hashCode(collator);
    }

    @Override
    public String typeName() {
        return CONTENT_TYPE;
    }

    /** Returns the configured collator, or {@code null} if none has been set. */
    public Collator collator() {
        return collator;
    }

    /**
     * Sets the collator, freezing it first if it is not frozen already.
     *
     * @param collator the collator to use; must not be {@code null}
     * @throws NullPointerException if {@code collator} is {@code null}
     */
    public void setCollator(Collator collator) {
        checkIfFrozen();
        Objects.requireNonNull(collator, "collator must not be null");
        this.collator = collator.isFrozen() ? collator : collator.freeze();
    }

    /** Returns a term query on the configured null value, or {@code null} when no null value is set. */
    @Override
    public Query nullValueQuery() {
        if (nullValue() == null) {
            return null;
        }
        return termQuery(nullValue(), null);
    }

    @Override
    public IndexFieldData.Builder fielddataBuilder() {
        failIfNoDocValues();
        return new DocValuesIndexFieldData.Builder();
    }

    /**
     * Converts a search value into the bytes of its raw collation key.
     *
     * @param value the search value; a {@link BytesRef} is decoded as UTF-8 first
     * @return the collation key bytes, or {@code null} when {@code value} is {@code null}
     * @throws IllegalStateException if no collator has been set
     */
    @Override
    protected BytesRef indexedValueForSearch(Object value) {
        if (value == null) {
            return null;
        }
        if (value instanceof BytesRef) {
            value = ((BytesRef) value).utf8ToString();
        }
        if (collator == null) {
            throw new IllegalStateException("collator is null");
        }
        RawCollationKey key = collator.getRawCollationKey(value.toString(), null);
        return new BytesRef(key.bytes, 0, key.size);
    }

    /** Always throws: fuzzy queries are rejected for this field type. */
    @Override
    public Query fuzzyQuery(Object value, Fuzziness fuzziness, int prefixLength, int maxExpansions,
                            boolean transpositions) {
        throw new UnsupportedOperationException("[fuzzy] queries are not supported on [" + CONTENT_TYPE + "] fields.");
    }

    /** Always throws: prefix queries are rejected for this field type. */
    @Override
    public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
        throw new UnsupportedOperationException("[prefix] queries are not supported on [" + CONTENT_TYPE + "] fields.");
    }

    /** Always throws: regexp queries are rejected for this field type. */
    @Override
    public Query regexpQuery(String value, int flags, int maxDeterminizedStates,
                             MultiTermQuery.RewriteMethod method, QueryShardContext context) {
        throw new UnsupportedOperationException("[regexp] queries are not supported on [" + CONTENT_TYPE + "] fields.");
    }

    /**
     * Doc value format named {@code "collate"}. It only handles binary values: the bytes are
     * converted to and from a String with {@link IndexableBinaryStringTools}; every numeric
     * method throws {@link UnsupportedOperationException}. Declared {@code final} so the
     * shared instance cannot be reassigned.
     */
    public static final DocValueFormat COLLATE_FORMAT = new DocValueFormat() {
        @Override
        public String getWriteableName() {
            return "collate";
        }

        @Override
        public void writeTo(StreamOutput out) throws IOException {
            // intentionally empty: this implementation has no fields to write
        }

        @Override
        public String format(long value) {
            throw new UnsupportedOperationException();
        }

        @Override
        public String format(double value) {
            throw new UnsupportedOperationException();
        }

        @Override
        public String format(BytesRef value) {
            int encodedLength = IndexableBinaryStringTools.getEncodedLength(value.bytes, value.offset, value.length);
            char[] encoded = new char[encodedLength];
            IndexableBinaryStringTools.encode(value.bytes, value.offset, value.length, encoded, 0, encodedLength);
            return new String(encoded, 0, encodedLength);
        }

        @Override
        public long parseLong(String value, boolean roundUp, LongSupplier now) {
            throw new UnsupportedOperationException();
        }

        @Override
        public double parseDouble(String value, boolean roundUp, LongSupplier now) {
            throw new UnsupportedOperationException();
        }

        @Override
        public BytesRef parseBytesRef(String value) {
            char[] encoded = value.toCharArray();
            int decodedLength = IndexableBinaryStringTools.getDecodedLength(encoded, 0, encoded.length);
            byte[] decoded = new byte[decodedLength];
            IndexableBinaryStringTools.decode(encoded, 0, encoded.length, decoded, 0, decodedLength);
            return new BytesRef(decoded);
        }
    };

    /** Always returns {@link #COLLATE_FORMAT}; the {@code format} and {@code timeZone} arguments are ignored. */
    @Override
    public DocValueFormat docValueFormat(final String format, final DateTimeZone timeZone) {
        return COLLATE_FORMAT;
    }
}
public static class Builder extends FieldMapper.Builder<Builder, ICUCollationKeywordFieldMapper> {
private String rules = null;
private String language = null;
private String country = null;
private String variant = null;
private String strength = null;
private String decomposition = null;
private String alternate = null;
private boolean caseLevel = false;
private String caseFirst = null;
private boolean numeric = false;
private String variableTop = null;
private boolean hiraganaQuaternaryMode = false;
private String nullValue = Defaults.NULL_VALUE;
public Builder(String name) {
super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE);
builder = this;
}
@Override
public CollationFieldType fieldType() {
return (CollationFieldType) super.fieldType();
}
@Override
public Builder indexOptions(IndexOptions indexOptions) {
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) > 0) {
throw new IllegalArgumentException("The [" + CONTENT_TYPE + "] field does not support positions, got [index_options]="
+ indexOptionToString(indexOptions));
}
return super.indexOptions(indexOptions);
}
public String rules() {
return rules;
}
public Builder rules(final String rules) {
this.rules = rules;
return this;
}
public String language() {
return language;
}
public Builder language(final String language) {
this.language = language;
return this;
}
public String country() {
return country;
}
public Builder country(final String country) {
this.country = country;
return this;
}
public String variant() {
return variant;
}
public Builder variant(final String variant) {
this.variant = variant;
return this;
}
public String strength() {
return strength;
}
public Builder strength(final String strength) {
this.strength = strength;
return this;
}
public String decomposition() {
return decomposition;
}
public Builder decomposition(final String decomposition) {
this.decomposition = decomposition;
return this;
}
public String alternate() {
return alternate;
}
public Builder alternate(final String alternate) {
this.alternate = alternate;
return this;
}
public boolean caseLevel() {
return caseLevel;
}
public Builder caseLevel(final boolean caseLevel) {
this.caseLevel = caseLevel;
return this;
}
public String caseFirst() {
return caseFirst;
}
public Builder caseFirst(final String caseFirst) {
this.caseFirst = caseFirst;
return this;
}
public boolean numeric() {
return numeric;
}
public Builder numeric(final boolean numeric) {
this.numeric = numeric;
return this;
}
public String variableTop() {
return variableTop;
}
public Builder variableTop(final String variableTop) {
this.variableTop = variableTop;
return this;
}
public boolean hiraganaQuaternaryMode() {
return hiraganaQuaternaryMode;
}
public Builder hiraganaQuaternaryMode(final boolean hiraganaQuaternaryMode) {
this.hiraganaQuaternaryMode = hiraganaQuaternaryMode;
return this;
}
public Collator buildCollator() {
Collator collator;
if (rules != null) {
try {
collator = new RuleBasedCollator(rules);
} catch (Exception e) {
throw new IllegalArgumentException("Failed to parse collation rules", e);
}
} else {
if (language != null) {
ULocale locale;
if (country != null) {
if (variant != null) {
locale = new ULocale(language, country, variant);
} else {
locale = new ULocale(language, country);
}
} else {
locale = new ULocale(language);
}
collator = Collator.getInstance(locale);
} else {
collator = Collator.getInstance();
}
}
// set the strength flag, otherwise it will be the default.
if (strength != null) {
if (strength.equalsIgnoreCase("primary")) {
collator.setStrength(Collator.PRIMARY);
} else if (strength.equalsIgnoreCase("secondary")) {
collator.setStrength(Collator.SECONDARY);
} else if (strength.equalsIgnoreCase("tertiary")) {
collator.setStrength(Collator.TERTIARY);
} else if (strength.equalsIgnoreCase("quaternary")) {
collator.setStrength(Collator.QUATERNARY);
} else if (strength.equalsIgnoreCase("identical")) {
collator.setStrength(Collator.IDENTICAL);
} else {
throw new IllegalArgumentException("Invalid strength: " + strength);
}
}
// set the decomposition flag, otherwise it will be the default.
if (decomposition != null) {
if (decomposition.equalsIgnoreCase("no")) {
collator.setDecomposition(Collator.NO_DECOMPOSITION);
} else if (decomposition.equalsIgnoreCase("canonical")) {
collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
} else {
throw new IllegalArgumentException("Invalid decomposition: " + decomposition);
}
}
// expert options: concrete subclasses are always a RuleBasedCollator
RuleBasedCollator rbc = (RuleBasedCollator) collator;
if (alternate != null) {
if (alternate.equalsIgnoreCase("shifted")) {
rbc.setAlternateHandlingShifted(true);
} else if (alternate.equalsIgnoreCase("non-ignorable")) {
rbc.setAlternateHandlingShifted(false);
} else {
throw new IllegalArgumentException("Invalid alternate: " + alternate);
}
}
if (caseLevel) {
rbc.setCaseLevel(true);
}
if (caseFirst != null) {
if (caseFirst.equalsIgnoreCase("lower")) {
rbc.setLowerCaseFirst(true);
} else if (caseFirst.equalsIgnoreCase("upper")) {
rbc.setUpperCaseFirst(true);
} else {
throw new IllegalArgumentException("Invalid caseFirst: " + caseFirst);
}
}
if (numeric) {
rbc.setNumericCollation(true);
}
if (variableTop != null) {
rbc.setVariableTop(variableTop);
}
if (hiraganaQuaternaryMode) {
rbc.setHiraganaQuaternary(true);
}
// freeze so thread-safe
return collator.freeze();
}
@Override
public ICUCollationKeywordFieldMapper build(BuilderContext context) {
final Collator collator = buildCollator();
fieldType().setCollator(collator);
setupFieldType(context);
return new ICUCollationKeywordFieldMapper(name, fieldType, defaultFieldType, context.indexSettings(),
multiFieldsBuilder.build(this, context), copyTo, rules, language, country, variant, strength, decomposition,
alternate, caseLevel, caseFirst, numeric, variableTop, hiraganaQuaternaryMode, collator);
}
}
/**
 * Parses the mapping definition of an {@code icu_collation_keyword} field.
 * Every setting recognised here is removed from {@code node} after it has been applied;
 * unknown entries are left in the map untouched.
 */
public static class TypeParser implements Mapper.TypeParser {
    @Override
    public Mapper.Builder<?, ?> parse(String name, Map<String, Object> node, ParserContext parserContext)
            throws MapperParsingException {
        final Builder builder = new Builder(name);
        TypeParsers.parseField(builder, name, node, parserContext);

        final Iterator<Map.Entry<String, Object>> entries = node.entrySet().iterator();
        while (entries.hasNext()) {
            final Map.Entry<String, Object> entry = entries.next();
            final String settingName = entry.getKey();
            final Object settingValue = entry.getValue();

            // flipped to false only for settings this parser does not know
            boolean consumed = true;
            switch (settingName) {
                case "null_value":
                    if (settingValue == null) {
                        throw new MapperParsingException("Property [null_value] cannot be null.");
                    }
                    builder.nullValue(settingValue.toString());
                    break;
                case "norms":
                    builder.omitNorms(!XContentMapValues.nodeBooleanValue(settingValue, "norms"));
                    break;
                case "rules":
                    builder.rules(XContentMapValues.nodeStringValue(settingValue, null));
                    break;
                case "language":
                    builder.language(XContentMapValues.nodeStringValue(settingValue, null));
                    break;
                case "country":
                    builder.country(XContentMapValues.nodeStringValue(settingValue, null));
                    break;
                case "variant":
                    builder.variant(XContentMapValues.nodeStringValue(settingValue, null));
                    break;
                case "strength":
                    builder.strength(XContentMapValues.nodeStringValue(settingValue, null));
                    break;
                case "decomposition":
                    builder.decomposition(XContentMapValues.nodeStringValue(settingValue, null));
                    break;
                case "alternate":
                    builder.alternate(XContentMapValues.nodeStringValue(settingValue, null));
                    break;
                case "case_level":
                    builder.caseLevel(XContentMapValues.nodeBooleanValue(settingValue, false));
                    break;
                case "case_first":
                    builder.caseFirst(XContentMapValues.nodeStringValue(settingValue, null));
                    break;
                case "numeric":
                    builder.numeric(XContentMapValues.nodeBooleanValue(settingValue, false));
                    break;
                case "variable_top":
                    builder.variableTop(XContentMapValues.nodeStringValue(settingValue, null));
                    break;
                case "hiragana_quaternary_mode":
                    builder.hiraganaQuaternaryMode(XContentMapValues.nodeBooleanValue(settingValue, false));
                    break;
                default:
                    consumed = false;
                    break;
            }
            if (consumed) {
                entries.remove();
            }
        }
        return builder;
    }
}
private final String rules;
private final String language;
private final String country;
private final String variant;
private final String strength;
private final String decomposition;
private final String alternate;
private final boolean caseLevel;
private final String caseFirst;
private final boolean numeric;
private final String variableTop;
private final boolean hiraganaQuaternaryMode;
private final Collator collator;
/**
 * Creates the mapper. The raw collation settings are kept alongside the collator built
 * from them; the collator must already be frozen (checked with an assertion).
 */
protected ICUCollationKeywordFieldMapper(String simpleName, MappedFieldType fieldType, MappedFieldType defaultFieldType,
                                         Settings indexSettings, MultiFields multiFields, CopyTo copyTo,
                                         String rules, String language, String country, String variant,
                                         String strength, String decomposition, String alternate,
                                         boolean caseLevel, String caseFirst, boolean numeric,
                                         String variableTop, boolean hiraganaQuaternaryMode, Collator collator) {
    super(simpleName, fieldType, defaultFieldType, indexSettings, multiFields, copyTo);
    assert collator.isFrozen();
    this.collator = collator;

    // rules or locale the collator was created from
    this.rules = rules;
    this.language = language;
    this.country = country;
    this.variant = variant;

    // tuning options
    this.strength = strength;
    this.decomposition = decomposition;
    this.alternate = alternate;
    this.caseLevel = caseLevel;
    this.caseFirst = caseFirst;
    this.numeric = numeric;
    this.variableTop = variableTop;
    this.hiraganaQuaternaryMode = hiraganaQuaternaryMode;
}
/** Returns the field type narrowed to {@link CollationFieldType}. */
@Override
public CollationFieldType fieldType() {
    final MappedFieldType type = super.fieldType();
    return (CollationFieldType) type;
}
/** Returns the mapping type name, {@link #CONTENT_TYPE}. */
@Override
protected String contentType() {
    final String type = CONTENT_TYPE;
    return type;
}
/**
 * Merges another mapper into this one. After the super class merge, every collation
 * setting is compared; all differing settings are collected and reported together.
 *
 * @throws IllegalArgumentException if any collation setting differs
 */
@Override
protected void doMerge(Mapper mergeWith, boolean updateAllTypes) {
    super.doMerge(mergeWith, updateAllTypes);

    final ICUCollationKeywordFieldMapper other = (ICUCollationKeywordFieldMapper) mergeWith;
    final List<String> conflicts = new ArrayList<>();

    recordIfChanged(conflicts, "rules", rules, other.rules);
    recordIfChanged(conflicts, "language", language, other.language);
    recordIfChanged(conflicts, "country", country, other.country);
    recordIfChanged(conflicts, "variant", variant, other.variant);
    recordIfChanged(conflicts, "strength", strength, other.strength);
    recordIfChanged(conflicts, "decomposition", decomposition, other.decomposition);
    recordIfChanged(conflicts, "alternate", alternate, other.alternate);
    recordIfChanged(conflicts, "case_level", caseLevel, other.caseLevel);
    recordIfChanged(conflicts, "case_first", caseFirst, other.caseFirst);
    recordIfChanged(conflicts, "numeric", numeric, other.numeric);
    recordIfChanged(conflicts, "variable_top", variableTop, other.variableTop);
    recordIfChanged(conflicts, "hiragana_quaternary_mode", hiraganaQuaternaryMode, other.hiraganaQuaternaryMode);

    if (conflicts.isEmpty() == false) {
        throw new IllegalArgumentException("Can't merge because of conflicts: " + conflicts);
    }
}

/** Adds a "Cannot update ..." message for {@code setting} when the two values are not equal. */
private static void recordIfChanged(List<String> conflicts, String setting, Object current, Object updated) {
    if (Objects.equals(current, updated)) {
        return;
    }
    conflicts.add("Cannot update " + setting + " setting for [" + CONTENT_TYPE + "]");
}
/**
 * Writes the field-specific mapping settings after those of the super class.
 * A setting is written when it differs from its default ({@code null} for strings,
 * {@code false} for booleans) or when {@code includeDefaults} is set.
 */
@Override
protected void doXContentBody(XContentBuilder builder, boolean includeDefaults, Params params) throws IOException {
    super.doXContentBody(builder, includeDefaults, params);

    final Object nullValue = fieldType().nullValue();
    if (includeDefaults || nullValue != null) {
        builder.field("null_value", nullValue);
    }

    writeStringSetting(builder, includeDefaults, "rules", rules);
    writeStringSetting(builder, includeDefaults, "language", language);
    writeStringSetting(builder, includeDefaults, "country", country);
    writeStringSetting(builder, includeDefaults, "variant", variant);
    writeStringSetting(builder, includeDefaults, "strength", strength);
    writeStringSetting(builder, includeDefaults, "decomposition", decomposition);
    writeStringSetting(builder, includeDefaults, "alternate", alternate);
    writeBooleanSetting(builder, includeDefaults, "case_level", caseLevel);
    writeStringSetting(builder, includeDefaults, "case_first", caseFirst);
    writeBooleanSetting(builder, includeDefaults, "numeric", numeric);
    writeStringSetting(builder, includeDefaults, "variable_top", variableTop);
    writeBooleanSetting(builder, includeDefaults, "hiragana_quaternary_mode", hiraganaQuaternaryMode);
}

/** Writes a string setting when it is non-null or when {@code force} is set. */
private static void writeStringSetting(XContentBuilder builder, boolean force, String name, String value)
        throws IOException {
    if (force || value != null) {
        builder.field(name, value);
    }
}

/** Writes a boolean setting when it is {@code true} or when {@code force} is set. */
private static void writeBooleanSetting(XContentBuilder builder, boolean force, String name, boolean value)
        throws IOException {
    if (force || value) {
        builder.field(name, value);
    }
}
/**
 * Turns the current value into Lucene fields holding its raw collation key.
 * <p>
 * The value comes from the external value when one is set, otherwise from the parser
 * (an explicit null is replaced by the configured null value). Nothing is added when the
 * resulting value is {@code null}.
 */
@Override
protected void parseCreateField(ParseContext context, List<IndexableField> fields) throws IOException {
    final String text;
    if (context.externalValueSet() == false) {
        final XContentParser parser = context.parser();
        text = parser.currentToken() == XContentParser.Token.VALUE_NULL
            ? fieldType().nullValueAsString()
            : parser.textOrNull();
    } else {
        text = context.externalValue().toString();
    }
    if (text == null) {
        return;
    }

    final CollationFieldType type = fieldType();
    final RawCollationKey rawKey = collator.getRawCollationKey(text, null);
    // only the first rawKey.size bytes of the key buffer are used
    final BytesRef keyBytes = new BytesRef(rawKey.bytes, 0, rawKey.size);

    final boolean indexedOrStored = type.indexOptions() != IndexOptions.NONE || type.stored();
    if (indexedOrStored) {
        fields.add(new Field(type.name(), keyBytes, type));
    }
    if (type.hasDocValues()) {
        fields.add(new SortedDocValuesField(type.name(), keyBytes));
    }
}
}

View File

@ -19,6 +19,9 @@
package org.elasticsearch.plugin.analysis.icu; package org.elasticsearch.plugin.analysis.icu;
import static java.util.Collections.singletonMap;
import org.elasticsearch.common.io.stream.NamedWriteableRegistry;
import org.elasticsearch.index.analysis.CharFilterFactory; import org.elasticsearch.index.analysis.CharFilterFactory;
import org.elasticsearch.index.analysis.IcuCollationTokenFilterFactory; import org.elasticsearch.index.analysis.IcuCollationTokenFilterFactory;
import org.elasticsearch.index.analysis.IcuFoldingTokenFilterFactory; import org.elasticsearch.index.analysis.IcuFoldingTokenFilterFactory;
@ -28,16 +31,20 @@ import org.elasticsearch.index.analysis.IcuTokenizerFactory;
import org.elasticsearch.index.analysis.IcuTransformTokenFilterFactory; import org.elasticsearch.index.analysis.IcuTransformTokenFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory; import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory; import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.index.mapper.ICUCollationKeywordFieldMapper;
import org.elasticsearch.index.mapper.Mapper;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.plugins.AnalysisPlugin; import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.MapperPlugin;
import org.elasticsearch.plugins.Plugin; import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.search.DocValueFormat;
import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import java.util.List;
import java.util.Map; import java.util.Map;
import static java.util.Collections.singletonMap; public class AnalysisICUPlugin extends Plugin implements AnalysisPlugin, MapperPlugin {
public class AnalysisICUPlugin extends Plugin implements AnalysisPlugin {
@Override @Override
public Map<String, AnalysisProvider<CharFilterFactory>> getCharFilters() { public Map<String, AnalysisProvider<CharFilterFactory>> getCharFilters() {
return singletonMap("icu_normalizer", IcuNormalizerCharFilterFactory::new); return singletonMap("icu_normalizer", IcuNormalizerCharFilterFactory::new);
@ -57,4 +64,20 @@ public class AnalysisICUPlugin extends Plugin implements AnalysisPlugin {
public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() { public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
return singletonMap("icu_tokenizer", IcuTokenizerFactory::new); return singletonMap("icu_tokenizer", IcuTokenizerFactory::new);
} }
/** Registers the {@code icu_collation_keyword} field type with its type parser. */
@Override
public Map<String, Mapper.TypeParser> getMappers() {
    final Mapper.TypeParser collationParser = new ICUCollationKeywordFieldMapper.TypeParser();
    return singletonMap(ICUCollationKeywordFieldMapper.CONTENT_TYPE, collationParser);
}
/**
 * Registers the collation doc value format under its writeable name. The reader ignores
 * the stream and returns the shared {@code COLLATE_FORMAT} instance.
 */
@Override
public List<NamedWriteableRegistry.Entry> getNamedWriteables() {
    final String formatName = ICUCollationKeywordFieldMapper.CollationFieldType.COLLATE_FORMAT.getWriteableName();
    final NamedWriteableRegistry.Entry collateEntry = new NamedWriteableRegistry.Entry(
        DocValueFormat.class,
        formatName,
        in -> ICUCollationKeywordFieldMapper.CollationFieldType.COLLATE_FORMAT);
    return Collections.singletonList(collateEntry);
}
} }

View File

@ -0,0 +1,145 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.mapper;
import com.carrotsearch.randomizedtesting.generators.RandomStrings;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RawCollationKey;
import com.ibm.icu.util.ULocale;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.unit.Fuzziness;
import org.elasticsearch.index.mapper.ICUCollationKeywordFieldMapper.CollationFieldType;
import org.elasticsearch.index.mapper.MappedFieldType.Relation;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
public class CollationFieldTypeTests extends FieldTypeTestCase {
/** Supplies a {@link CollationFieldType} created with its no-arg constructor as the field type under test. */
@Override
protected MappedFieldType createDefaultFieldType() {
    final CollationFieldType defaultFieldType = new CollationFieldType();
    return defaultFieldType;
}
/**
 * Checks that {@code isFieldWithinQuery} reports {@link Relation#INTERSECTS} for random string bounds
 * and random inclusivity flags.
 */
public void testIsFieldWithinQuery() throws IOException {
    final CollationFieldType fieldType = new CollationFieldType();
    // the random values are drawn in the same order in which they are passed to the call below
    final String from = RandomStrings.randomAsciiOfLengthBetween(random(), 0, 5);
    final String to = RandomStrings.randomAsciiOfLengthBetween(random(), 0, 5);
    final boolean includeLower = randomBoolean();
    final boolean includeUpper = randomBoolean();
    // current impl ignores args and should always return INTERSECTS
    final Relation relation = fieldType.isFieldWithinQuery(null, from, to, includeLower, includeUpper, null, null, null);
    assertEquals(Relation.INTERSECTS, relation);
}
/**
 * With a frozen Turkish collator at primary strength, an upper-case query value must yield a term query on the
 * collation key of its lower-case counterpart. Once the field is no longer indexed, term queries must be rejected.
 */
public void testTermQuery() {
    MappedFieldType fieldType = createDefaultFieldType();
    fieldType.setName("field");
    fieldType.setIndexOptions(IndexOptions.DOCS);

    Collator turkishCollator = Collator.getInstance(new ULocale("tr"));
    turkishCollator.setStrength(Collator.PRIMARY);
    turkishCollator.freeze();
    ((CollationFieldType) fieldType).setCollator(turkishCollator);

    RawCollationKey lowerCaseKey = turkishCollator.getRawCollationKey("ı will use turkish casıng", null);
    BytesRef expectedTerm = new BytesRef(lowerCaseKey.bytes, 0, lowerCaseKey.size);
    TermQuery expectedQuery = new TermQuery(new Term("field", expectedTerm));
    assertEquals(expectedQuery, fieldType.termQuery("I WİLL USE TURKİSH CASING", null));

    // searching must fail as soon as the field is not indexed
    fieldType.setIndexOptions(IndexOptions.NONE);
    IllegalArgumentException notIndexed = expectThrows(IllegalArgumentException.class,
        () -> fieldType.termQuery("bar", null));
    assertEquals("Cannot search on field [field] since it is not indexed.", notIndexed.getMessage());
}
/**
 * A terms query must be a {@link TermInSetQuery} over the collation keys of the given values, and must be
 * rejected once the field is no longer indexed.
 */
public void testTermsQuery() {
    MappedFieldType fieldType = createDefaultFieldType();
    fieldType.setName("field");
    fieldType.setIndexOptions(IndexOptions.DOCS);

    Collator frozenCollator = Collator.getInstance().freeze();
    ((CollationFieldType) fieldType).setCollator(frozenCollator);

    RawCollationKey fooKey = frozenCollator.getRawCollationKey("foo", null);
    RawCollationKey barKey = frozenCollator.getRawCollationKey("bar", null);
    BytesRef fooTerm = new BytesRef(fooKey.bytes, 0, fooKey.size);
    BytesRef barTerm = new BytesRef(barKey.bytes, 0, barKey.size);

    List<BytesRef> expectedTerms = new ArrayList<>();
    expectedTerms.add(fooTerm);
    expectedTerms.add(barTerm);
    TermInSetQuery expectedQuery = new TermInSetQuery("field", expectedTerms);
    assertEquals(expectedQuery, fieldType.termsQuery(Arrays.asList("foo", "bar"), null));

    // searching must fail as soon as the field is not indexed
    fieldType.setIndexOptions(IndexOptions.NONE);
    IllegalArgumentException notIndexed = expectThrows(IllegalArgumentException.class,
        () -> fieldType.termsQuery(Arrays.asList("foo", "bar"), null));
    assertEquals("Cannot search on field [field] since it is not indexed.", notIndexed.getMessage());
}
/** Regexp queries on the collation field type are expected to throw {@link UnsupportedOperationException}. */
public void testRegexpQuery() {
    MappedFieldType fieldType = createDefaultFieldType();
    fieldType.setName("field");
    fieldType.setIndexOptions(IndexOptions.DOCS);
    expectThrows(UnsupportedOperationException.class, () -> fieldType.regexpQuery("foo.*", 0, 10, null, null));
}
/** Fuzzy queries on the collation field type are expected to throw {@link UnsupportedOperationException}. */
public void testFuzzyQuery() {
    MappedFieldType fieldType = createDefaultFieldType();
    fieldType.setName("field");
    fieldType.setIndexOptions(IndexOptions.DOCS);
    expectThrows(UnsupportedOperationException.class, () -> fieldType.fuzzyQuery("foo", Fuzziness.fromEdits(2), 1, 50, true));
}
/** Prefix queries on the collation field type are expected to throw {@link UnsupportedOperationException}. */
public void testPrefixQuery() {
    MappedFieldType fieldType = createDefaultFieldType();
    fieldType.setName("field");
    fieldType.setIndexOptions(IndexOptions.DOCS);
    expectThrows(UnsupportedOperationException.class, () -> fieldType.prefixQuery("prefix", null, null));
}
/**
 * A range query must be a {@link TermRangeQuery} bounded by the collation keys of both endpoints, and must be
 * rejected once the field is no longer indexed.
 */
public void testRangeQuery() {
    MappedFieldType fieldType = createDefaultFieldType();
    fieldType.setName("field");
    fieldType.setIndexOptions(IndexOptions.DOCS);

    Collator frozenCollator = Collator.getInstance().freeze();
    ((CollationFieldType) fieldType).setCollator(frozenCollator);

    RawCollationKey lowerKey = frozenCollator.getRawCollationKey("a", null);
    RawCollationKey upperKey = frozenCollator.getRawCollationKey("b", null);
    BytesRef lowerTerm = new BytesRef(lowerKey.bytes, 0, lowerKey.size);
    BytesRef upperTerm = new BytesRef(upperKey.bytes, 0, upperKey.size);
    TermRangeQuery expectedQuery = new TermRangeQuery("field", lowerTerm, upperTerm, false, false);
    assertEquals(expectedQuery, fieldType.rangeQuery("a", "b", false, false, null));

    // searching must fail as soon as the field is not indexed
    fieldType.setIndexOptions(IndexOptions.NONE);
    IllegalArgumentException notIndexed = expectThrows(IllegalArgumentException.class,
        () -> fieldType.rangeQuery("a", "b", false, false, null));
    assertEquals("Cannot search on field [field] since it is not indexed.", notIndexed.getMessage());
}
}

View File

@ -0,0 +1,443 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.mapper;
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNoFailures;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertOrderedSearchHits;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.plugin.analysis.icu.AnalysisICUPlugin;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.sort.SortOrder;
import org.elasticsearch.test.ESIntegTestCase;
import java.util.Collection;
import java.util.Collections;
public class ICUCollationKeywordFieldMapperIT extends ESIntegTestCase {
/** Installs only the ICU analysis plugin on the test nodes. */
@Override
protected Collection<Class<? extends Plugin>> nodePlugins() {
    final Collection<Class<? extends Plugin>> plugins = Collections.singletonList(AnalysisICUPlugin.class);
    return plugins;
}
/**
 * Turkish has some funny casing.
 * This test shows how that can be handled with collation: instead of a LowerCaseFilter,
 * a Turkish collator with primary strength is used, and both spellings are expected
 * to match and sort as the same value.
 */
public void testBasicUsage() throws Exception {
    final String indexName = "foo";
    final String typeName = "mytype";
    final String[] equivalent = {"I WİLL USE TURKİSH CASING", "ı will use turkish casıng"};

    XContentBuilder mapping = jsonBuilder()
        .startObject()
            .startObject("properties")
                .startObject("collate")
                    .field("type", "icu_collation_keyword")
                    .field("language", "tr")
                    .field("strength", "primary")
                .endObject()
            .endObject()
        .endObject();
    assertAcked(client().admin().indices().prepareCreate(indexName).addMapping(typeName, mapping));

    // both values should collate to same value
    indexRandom(true,
        client().prepareIndex(indexName, typeName, "1").setSource("{\"collate\":\"" + equivalent[0] + "\"}", XContentType.JSON),
        client().prepareIndex(indexName, typeName, "2").setSource("{\"collate\":\"" + equivalent[1] + "\"}", XContentType.JSON));

    // searching for either of the terms should return both results since they collate to the same value
    final String searchTerm = randomBoolean() ? equivalent[0] : equivalent[1];
    SearchSourceBuilder source = new SearchSourceBuilder()
        .fetchSource(false)
        .query(QueryBuilders.termQuery("collate", searchTerm))
        .sort("collate")
        // secondary sort should kick in because both will collate to same value
        .sort("_uid", SortOrder.DESC);
    SearchRequest request = new SearchRequest().indices(indexName).types(typeName).source(source);

    SearchResponse response = client().search(request).actionGet();
    assertNoFailures(response);
    assertHitCount(response, 2L);
    assertOrderedSearchHits(response, "2", "1");
}
/**
 * Test usage of the decomposition option for unicode normalization: one of the two values
 * spells a dotted capital I as a plain I followed by a combining dot above.
 */
public void testNormalization() throws Exception {
    final String indexName = "foo";
    final String typeName = "mytype";
    final String[] equivalent = {"I W\u0049\u0307LL USE TURKİSH CASING", "ı will use turkish casıng"};

    XContentBuilder mapping = jsonBuilder()
        .startObject()
            .startObject("properties")
                .startObject("collate")
                    .field("type", "icu_collation_keyword")
                    .field("language", "tr")
                    .field("strength", "primary")
                    .field("decomposition", "canonical")
                .endObject()
            .endObject()
        .endObject();
    assertAcked(client().admin().indices().prepareCreate(indexName).addMapping(typeName, mapping));

    indexRandom(true,
        client().prepareIndex(indexName, typeName, "1").setSource("{\"collate\":\"" + equivalent[0] + "\"}", XContentType.JSON),
        client().prepareIndex(indexName, typeName, "2").setSource("{\"collate\":\"" + equivalent[1] + "\"}", XContentType.JSON));

    // searching for either of the terms should return both results since they collate to the same value
    final String searchTerm = randomBoolean() ? equivalent[0] : equivalent[1];
    SearchSourceBuilder source = new SearchSourceBuilder()
        .fetchSource(false)
        .query(QueryBuilders.termQuery("collate", searchTerm))
        .sort("collate")
        // secondary sort should kick in because both will collate to same value
        .sort("_uid", SortOrder.DESC);
    SearchRequest request = new SearchRequest().indices(indexName).types(typeName).source(source);

    SearchResponse response = client().search(request).actionGet();
    assertNoFailures(response);
    assertHitCount(response, 2L);
    assertOrderedSearchHits(response, "2", "1");
}
/**
 * Test secondary strength: for English, case is not significant at this strength,
 * so "TESTING" and "testing" are expected to match and sort as the same value.
 */
public void testSecondaryStrength() throws Exception {
    final String indexName = "foo";
    final String typeName = "mytype";
    final String[] equivalent = {"TESTING", "testing"};

    XContentBuilder mapping = jsonBuilder()
        .startObject()
            .startObject("properties")
                .startObject("collate")
                    .field("type", "icu_collation_keyword")
                    .field("language", "en")
                    .field("strength", "secondary")
                    .field("decomposition", "no")
                .endObject()
            .endObject()
        .endObject();
    assertAcked(client().admin().indices().prepareCreate(indexName).addMapping(typeName, mapping));

    indexRandom(true,
        client().prepareIndex(indexName, typeName, "1").setSource("{\"collate\":\"" + equivalent[0] + "\"}", XContentType.JSON),
        client().prepareIndex(indexName, typeName, "2").setSource("{\"collate\":\"" + equivalent[1] + "\"}", XContentType.JSON));

    final String searchTerm = randomBoolean() ? equivalent[0] : equivalent[1];
    SearchSourceBuilder source = new SearchSourceBuilder()
        .fetchSource(false)
        .query(QueryBuilders.termQuery("collate", searchTerm))
        .sort("collate")
        // secondary sort should kick in because both will collate to same value
        .sort("_uid", SortOrder.DESC);
    SearchRequest request = new SearchRequest().indices(indexName).types(typeName).source(source);

    SearchResponse response = client().search(request).actionGet();
    assertNoFailures(response);
    assertHitCount(response, 2L);
    assertOrderedSearchHits(response, "2", "1");
}
/**
 * Setting alternate=shifted to shift whitespace, punctuation and symbols
 * to quaternary level, so that "foo-bar" and "foo bar" are expected to match
 * and sort as the same value at primary strength.
 */
public void testIgnorePunctuation() throws Exception {
    final String indexName = "foo";
    final String typeName = "mytype";
    final String[] equivalent = {"foo-bar", "foo bar"};

    XContentBuilder mapping = jsonBuilder()
        .startObject()
            .startObject("properties")
                .startObject("collate")
                    .field("type", "icu_collation_keyword")
                    .field("language", "en")
                    .field("strength", "primary")
                    .field("alternate", "shifted")
                .endObject()
            .endObject()
        .endObject();
    assertAcked(client().admin().indices().prepareCreate(indexName).addMapping(typeName, mapping));

    indexRandom(true,
        client().prepareIndex(indexName, typeName, "1").setSource("{\"collate\":\"" + equivalent[0] + "\"}", XContentType.JSON),
        client().prepareIndex(indexName, typeName, "2").setSource("{\"collate\":\"" + equivalent[1] + "\"}", XContentType.JSON));

    final String searchTerm = randomBoolean() ? equivalent[0] : equivalent[1];
    SearchSourceBuilder source = new SearchSourceBuilder()
        .fetchSource(false)
        .query(QueryBuilders.termQuery("collate", searchTerm))
        .sort("collate")
        // secondary sort should kick in because both will collate to same value
        .sort("_uid", SortOrder.DESC);
    SearchRequest request = new SearchRequest().indices(indexName).types(typeName).source(source);

    SearchResponse response = client().search(request).actionGet();
    assertNoFailures(response);
    assertHitCount(response, 2L);
    assertOrderedSearchHits(response, "2", "1");
}
/**
 * Setting alternate=shifted and variableTop to shift whitespace, but not
 * punctuation or symbols, to quaternary level. The field is not indexed,
 * so only sorting is exercised.
 */
public void testIgnoreWhitespace() throws Exception {
    final String indexName = "foo";
    final String typeName = "mytype";

    XContentBuilder mapping = jsonBuilder()
        .startObject()
            .startObject("properties")
                .startObject("collate")
                    .field("type", "icu_collation_keyword")
                    .field("language", "en")
                    .field("strength", "primary")
                    .field("alternate", "shifted")
                    .field("variable_top", " ")
                    .field("index", false)
                .endObject()
            .endObject()
        .endObject();
    assertAcked(client().admin().indices().prepareCreate(indexName).addMapping(typeName, mapping));

    indexRandom(true,
        client().prepareIndex(indexName, typeName, "1").setSource("{\"collate\":\"foo bar\"}", XContentType.JSON),
        client().prepareIndex(indexName, typeName, "2").setSource("{\"collate\":\"foobar\"}", XContentType.JSON),
        client().prepareIndex(indexName, typeName, "3").setSource("{\"collate\":\"foo-bar\"}", XContentType.JSON));

    SearchSourceBuilder source = new SearchSourceBuilder()
        .fetchSource(false)
        .sort("collate", SortOrder.ASC)
        // secondary sort should kick in on docs 1 and 2: they differ only by whitespace and the
        // asserted order below expects them to tie on the collation value
        .sort("_uid", SortOrder.ASC);
    SearchRequest request = new SearchRequest().indices(indexName).types(typeName).source(source);

    SearchResponse response = client().search(request).actionGet();
    assertNoFailures(response);
    assertHitCount(response, 3L);
    assertOrderedSearchHits(response, "3", "1", "2");
}
/**
 * Setting numeric to encode digits with numeric value, so that
 * foobar-9 sorts before foobar-10. The field is not indexed, so only
 * sorting is exercised.
 */
public void testNumerics() throws Exception {
    final String indexName = "foo";
    final String typeName = "mytype";

    XContentBuilder mapping = jsonBuilder()
        .startObject()
            .startObject("properties")
                .startObject("collate")
                    .field("type", "icu_collation_keyword")
                    .field("language", "en")
                    .field("numeric", true)
                    .field("index", false)
                .endObject()
            .endObject()
        .endObject();
    assertAcked(client().admin().indices().prepareCreate(indexName).addMapping(typeName, mapping));

    indexRandom(true,
        client().prepareIndex(indexName, typeName, "1").setSource("{\"collate\":\"foobar-10\"}", XContentType.JSON),
        client().prepareIndex(indexName, typeName, "2").setSource("{\"collate\":\"foobar-9\"}", XContentType.JSON));

    SearchSourceBuilder source = new SearchSourceBuilder()
        .fetchSource(false)
        .sort("collate", SortOrder.ASC);
    SearchRequest request = new SearchRequest().indices(indexName).types(typeName).source(source);

    SearchResponse response = client().search(request).actionGet();
    assertNoFailures(response);
    assertHitCount(response, 2L);
    assertOrderedSearchHits(response, "2", "1");
}
/**
 * Setting caseLevel=true to create an additional case level between
 * secondary and tertiary. The asserted order groups the lower-case values
 * ahead of the capitalized ones, with accents not affecting the order.
 */
public void testIgnoreAccentsButNotCase() throws Exception {
    final String indexName = "foo";
    final String typeName = "mytype";

    XContentBuilder mapping = jsonBuilder()
        .startObject()
            .startObject("properties")
                .startObject("collate")
                    .field("type", "icu_collation_keyword")
                    .field("language", "en")
                    .field("strength", "primary")
                    .field("case_level", true)
                    .field("index", false)
                .endObject()
            .endObject()
        .endObject();
    assertAcked(client().admin().indices().prepareCreate(indexName).addMapping(typeName, mapping));

    indexRandom(true,
        client().prepareIndex(indexName, typeName, "1").setSource("{\"collate\":\"résumé\"}", XContentType.JSON),
        client().prepareIndex(indexName, typeName, "2").setSource("{\"collate\":\"Resume\"}", XContentType.JSON),
        client().prepareIndex(indexName, typeName, "3").setSource("{\"collate\":\"resume\"}", XContentType.JSON),
        client().prepareIndex(indexName, typeName, "4").setSource("{\"collate\":\"Résumé\"}", XContentType.JSON));

    SearchSourceBuilder source = new SearchSourceBuilder()
        .fetchSource(false)
        .sort("collate", SortOrder.ASC)
        // ties within each case group are broken by descending _uid
        .sort("_uid", SortOrder.DESC);
    SearchRequest request = new SearchRequest().indices(indexName).types(typeName).source(source);

    SearchResponse response = client().search(request).actionGet();
    assertNoFailures(response);
    assertHitCount(response, 4L);
    assertOrderedSearchHits(response, "3", "1", "4", "2");
}
/**
 * Setting caseFirst=upper to cause uppercase strings to sort
 * before lowercase ones. The field is not indexed, so only sorting
 * is exercised.
 */
public void testUpperCaseFirst() throws Exception {
    final String indexName = "foo";
    final String typeName = "mytype";

    XContentBuilder mapping = jsonBuilder()
        .startObject()
            .startObject("properties")
                .startObject("collate")
                    .field("type", "icu_collation_keyword")
                    .field("language", "en")
                    .field("strength", "tertiary")
                    .field("case_first", "upper")
                    .field("index", false)
                .endObject()
            .endObject()
        .endObject();
    assertAcked(client().admin().indices().prepareCreate(indexName).addMapping(typeName, mapping));

    indexRandom(true,
        client().prepareIndex(indexName, typeName, "1").setSource("{\"collate\":\"resume\"}", XContentType.JSON),
        client().prepareIndex(indexName, typeName, "2").setSource("{\"collate\":\"Resume\"}", XContentType.JSON));

    SearchSourceBuilder source = new SearchSourceBuilder()
        .fetchSource(false)
        .sort("collate", SortOrder.ASC);
    SearchRequest request = new SearchRequest().indices(indexName).types(typeName).source(source);

    SearchResponse response = client().search(request).actionGet();
    assertNoFailures(response);
    assertHitCount(response, 2L);
    assertOrderedSearchHits(response, "2", "1");
}
/**
 * For german, you might want oe to sort and match with o umlaut.
 * This is not the default, but you can make a customized ruleset to do this.
 *
 * The default is DIN 5007-1, this shows how to tailor a collator to get DIN 5007-2 behavior.
 * http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4423383
 *
 * The tailored rules are passed to the mapping through the "rules" option, and "Töne" / "Toene"
 * are then expected to match and sort as the same value at primary strength.
 */
public void testCustomRules() throws Exception {
    String index = "foo";
    String type = "mytype";

    RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE"));
    // Each pair maps a two-letter spelling to its umlaut, once in lower case and once in upper case.
    // The upper-case UE pair must use the upper-case umlaut (U + combining diaeresis), like AE and OE do.
    String DIN5007_2_tailorings =
        "& ae , a\u0308 & AE , A\u0308" +
        "& oe , o\u0308 & OE , O\u0308" +
        "& ue , u\u0308 & UE , U\u0308";

    RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
    String tailoredRules = tailoredCollator.getRules();

    String[] equivalent = {"Töne", "Toene"};

    XContentBuilder builder = jsonBuilder()
        .startObject().startObject("properties")
        .startObject("collate")
        .field("type", "icu_collation_keyword")
        .field("rules", tailoredRules)
        .field("strength", "primary")
        .endObject()
        .endObject().endObject();

    assertAcked(client().admin().indices().prepareCreate(index).addMapping(type, builder));

    indexRandom(true,
        client().prepareIndex(index, type, "1").setSource("{\"collate\":\"" + equivalent[0] + "\"}", XContentType.JSON),
        client().prepareIndex(index, type, "2").setSource("{\"collate\":\"" + equivalent[1] + "\"}", XContentType.JSON)
    );

    SearchRequest request = new SearchRequest()
        .indices(index)
        .types(type)
        .source(new SearchSourceBuilder()
            .fetchSource(false)
            .query(QueryBuilders.termQuery("collate", randomBoolean() ? equivalent[0] : equivalent[1]))
            .sort("collate", SortOrder.ASC)
            .sort("_uid", SortOrder.DESC) // secondary sort should kick in because both will collate to same value
        );

    SearchResponse response = client().search(request).actionGet();
    assertNoFailures(response);
    assertHitCount(response, 2L);
    assertOrderedSearchHits(response, "2", "1");
}
}

View File

@ -0,0 +1,342 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.mapper;
import static org.hamcrest.Matchers.equalTo;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RawCollationKey;
import com.ibm.icu.util.ULocale;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.IndexableFieldType;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.compress.CompressedXContent;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.IndexService;
import org.elasticsearch.index.mapper.MapperService.MergeReason;
import org.elasticsearch.plugin.analysis.icu.AnalysisICUPlugin;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.test.ESSingleNodeTestCase;
import org.elasticsearch.test.InternalSettingsPlugin;
import org.junit.Before;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
public class ICUCollationKeywordFieldMapperTests extends ESSingleNodeTestCase {
// Mapping "type" value of the field exercised by every test in this class.
private static final String FIELD_TYPE = "icu_collation_keyword";
/** Loads the ICU analysis plugin together with the internal settings plugin for the single test node. */
@Override
protected Collection<Class<? extends Plugin>> getPlugins() {
    final Collection<Class<? extends Plugin>> plugins = Arrays.asList(AnalysisICUPlugin.class, InternalSettingsPlugin.class);
    return plugins;
}
// Index service of the "test" index; (re)assigned by setup() before each test.
IndexService indexService;
// Document mapper parser obtained from indexService's mapper service in setup().
DocumentMapperParser parser;
/** Creates the "test" index and keeps hold of its document mapper parser for the test methods. */
@Before
public void setup() {
    indexService = createIndex("test");
    final MapperService mapperService = indexService.mapperService();
    parser = mapperService.documentMapperParser();
}
/**
 * With a mapping that only sets the type, a value must produce two Lucene fields carrying the default
 * collator's key: first a field with DOCS index options and no doc values, then a SORTED doc values field.
 */
public void testDefaults() throws Exception {
    String mapping = XContentFactory.jsonBuilder()
        .startObject()
            .startObject("type")
                .startObject("properties")
                    .startObject("field").field("type", FIELD_TYPE).endObject()
                .endObject()
            .endObject()
        .endObject()
        .string();

    DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping));
    // the mapping must round-trip unchanged
    assertEquals(mapping, mapper.mappingSource().toString());

    SourceToParse source = SourceToParse.source("test", "type", "1",
        XContentFactory.jsonBuilder().startObject().field("field", "1234").endObject().bytes(),
        XContentType.JSON);
    ParsedDocument doc = mapper.parse(source);

    IndexableField[] fields = doc.rootDoc().getFields("field");
    assertEquals(2, fields.length);

    Collator defaultCollator = Collator.getInstance();
    RawCollationKey collationKey = defaultCollator.getRawCollationKey("1234", null);
    BytesRef expectedKey = new BytesRef(collationKey.bytes, 0, collationKey.size);

    // first field: the indexed collation key
    assertEquals(expectedKey, fields[0].binaryValue());
    IndexableFieldType indexedType = fields[0].fieldType();
    assertThat(indexedType.omitNorms(), equalTo(true));
    assertFalse(indexedType.tokenized());
    assertFalse(indexedType.stored());
    assertThat(indexedType.indexOptions(), equalTo(IndexOptions.DOCS));
    assertThat(indexedType.storeTermVectors(), equalTo(false));
    assertThat(indexedType.storeTermVectorOffsets(), equalTo(false));
    assertThat(indexedType.storeTermVectorPositions(), equalTo(false));
    assertThat(indexedType.storeTermVectorPayloads(), equalTo(false));
    assertEquals(DocValuesType.NONE, indexedType.docValuesType());

    // second field: the doc values entry for the same key
    assertEquals(expectedKey, fields[1].binaryValue());
    IndexableFieldType docValuesFieldType = fields[1].fieldType();
    assertThat(docValuesFieldType.indexOptions(), equalTo(IndexOptions.NONE));
    assertEquals(DocValuesType.SORTED, docValuesFieldType.docValuesType());
}
/**
 * Without {@code null_value} an explicit null yields no fields. With {@code null_value} set, a missing field
 * still yields nothing, while an explicit null is indexed as the collation key of the configured value.
 */
public void testNullValue() throws IOException {
    // mapping without null_value
    String plainMapping = XContentFactory.jsonBuilder()
        .startObject()
            .startObject("type")
                .startObject("properties")
                    .startObject("field").field("type", FIELD_TYPE).endObject()
                .endObject()
            .endObject()
        .endObject()
        .string();

    DocumentMapper plainMapper = parser.parse("type", new CompressedXContent(plainMapping));
    assertEquals(plainMapping, plainMapper.mappingSource().toString());

    ParsedDocument nullDoc = plainMapper.parse(SourceToParse.source("test", "type", "1",
        XContentFactory.jsonBuilder().startObject().nullField("field").endObject().bytes(),
        XContentType.JSON));
    assertArrayEquals(new IndexableField[0], nullDoc.rootDoc().getFields("field"));

    // mapping with null_value
    String nullValueMapping = XContentFactory.jsonBuilder()
        .startObject()
            .startObject("type")
                .startObject("properties")
                    .startObject("field").field("type", FIELD_TYPE).field("null_value", "1234").endObject()
                .endObject()
            .endObject()
        .endObject()
        .string();

    DocumentMapper nullValueMapper = parser.parse("type", new CompressedXContent(nullValueMapping));
    assertEquals(nullValueMapping, nullValueMapper.mappingSource().toString());

    // a document that omits the field entirely still produces no fields
    ParsedDocument emptyDoc = nullValueMapper.parse(SourceToParse.source("test", "type", "1",
        XContentFactory.jsonBuilder().startObject().endObject().bytes(),
        XContentType.JSON));
    IndexableField[] fieldsOfEmptyDoc = emptyDoc.rootDoc().getFields("field");
    assertEquals(0, fieldsOfEmptyDoc.length);

    // an explicit null is replaced by the configured null_value
    ParsedDocument substitutedDoc = nullValueMapper.parse(SourceToParse.source("test", "type", "1",
        XContentFactory.jsonBuilder().startObject().nullField("field").endObject().bytes(),
        XContentType.JSON));

    Collator defaultCollator = Collator.getInstance();
    RawCollationKey collationKey = defaultCollator.getRawCollationKey("1234", null);
    BytesRef expectedKey = new BytesRef(collationKey.bytes, 0, collationKey.size);

    IndexableField[] substitutedFields = substitutedDoc.rootDoc().getFields("field");
    assertEquals(2, substitutedFields.length);
    assertEquals(expectedKey, substitutedFields[0].binaryValue());
}
/** With {@code store: true} the first of the two produced fields must be flagged as stored. */
public void testEnableStore() throws IOException {
    String mapping = XContentFactory.jsonBuilder()
        .startObject()
            .startObject("type")
                .startObject("properties")
                    .startObject("field").field("type", FIELD_TYPE).field("store", true).endObject()
                .endObject()
            .endObject()
        .endObject()
        .string();

    DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping));
    assertEquals(mapping, mapper.mappingSource().toString());

    SourceToParse source = SourceToParse.source("test", "type", "1",
        XContentFactory.jsonBuilder().startObject().field("field", "1234").endObject().bytes(),
        XContentType.JSON);
    ParsedDocument doc = mapper.parse(source);

    IndexableField[] fields = doc.rootDoc().getFields("field");
    assertEquals(2, fields.length);
    assertTrue(fields[0].fieldType().stored());
}
/** With {@code index: false} only one field remains: it has no index options and SORTED doc values. */
public void testDisableIndex() throws IOException {
    String mapping = XContentFactory.jsonBuilder()
        .startObject()
            .startObject("type")
                .startObject("properties")
                    .startObject("field").field("type", FIELD_TYPE).field("index", false).endObject()
                .endObject()
            .endObject()
        .endObject()
        .string();

    DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping));
    assertEquals(mapping, mapper.mappingSource().toString());

    SourceToParse source = SourceToParse.source("test", "type", "1",
        XContentFactory.jsonBuilder().startObject().field("field", "1234").endObject().bytes(),
        XContentType.JSON);
    ParsedDocument doc = mapper.parse(source);

    IndexableField[] fields = doc.rootDoc().getFields("field");
    assertEquals(1, fields.length);
    IndexableField onlyField = fields[0];
    assertEquals(IndexOptions.NONE, onlyField.fieldType().indexOptions());
    assertEquals(DocValuesType.SORTED, onlyField.fieldType().docValuesType());
}
/** With {@code doc_values: false} only one field remains, and it carries no doc values. */
public void testDisableDocValues() throws IOException {
    String mapping = XContentFactory.jsonBuilder()
        .startObject()
            .startObject("type")
                .startObject("properties")
                    .startObject("field").field("type", FIELD_TYPE).field("doc_values", false).endObject()
                .endObject()
            .endObject()
        .endObject()
        .string();

    DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping));
    assertEquals(mapping, mapper.mappingSource().toString());

    SourceToParse source = SourceToParse.source("test", "type", "1",
        XContentFactory.jsonBuilder().startObject().field("field", "1234").endObject().bytes(),
        XContentType.JSON);
    ParsedDocument doc = mapper.parse(source);

    IndexableField[] fields = doc.rootDoc().getFields("field");
    assertEquals(1, fields.length);
    assertEquals(DocValuesType.NONE, fields[0].fieldType().docValuesType());
}
/**
 * {@code index_options: freqs} must be applied to the indexed field, whereas "positions" and "offsets"
 * must be rejected when the mapping is parsed.
 */
public void testIndexOptions() throws IOException {
    String mapping = XContentFactory.jsonBuilder()
        .startObject()
            .startObject("type")
                .startObject("properties")
                    .startObject("field").field("type", FIELD_TYPE).field("index_options", "freqs").endObject()
                .endObject()
            .endObject()
        .endObject()
        .string();

    DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping));
    assertEquals(mapping, mapper.mappingSource().toString());

    SourceToParse source = SourceToParse.source("test", "type", "1",
        XContentFactory.jsonBuilder().startObject().field("field", "1234").endObject().bytes(),
        XContentType.JSON);
    ParsedDocument doc = mapper.parse(source);

    IndexableField[] fields = doc.rootDoc().getFields("field");
    assertEquals(2, fields.length);
    assertEquals(IndexOptions.DOCS_AND_FREQS, fields[0].fieldType().indexOptions());

    // option values that imply positions are not supported by this field type
    for (String unsupportedOption : Arrays.asList("positions", "offsets")) {
        final String rejectedMapping = XContentFactory.jsonBuilder()
            .startObject()
                .startObject("type")
                    .startObject("properties")
                        .startObject("field").field("type", FIELD_TYPE).field("index_options", unsupportedOption).endObject()
                    .endObject()
                .endObject()
            .endObject()
            .string();
        IllegalArgumentException rejection = expectThrows(IllegalArgumentException.class,
            () -> parser.parse("type", new CompressedXContent(rejectedMapping)));
        String expectedMessage =
            "The [" + FIELD_TYPE + "] field does not support positions, got [index_options]=" + unsupportedOption;
        assertEquals(expectedMessage, rejection.getMessage());
    }
}
/** With {@code norms: true} the first of the two produced fields must no longer omit norms. */
public void testEnableNorms() throws IOException {
    String mapping = XContentFactory.jsonBuilder()
        .startObject()
            .startObject("type")
                .startObject("properties")
                    .startObject("field").field("type", FIELD_TYPE).field("norms", true).endObject()
                .endObject()
            .endObject()
        .endObject()
        .string();

    DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping));
    assertEquals(mapping, mapper.mappingSource().toString());

    SourceToParse source = SourceToParse.source("test", "type", "1",
        XContentFactory.jsonBuilder().startObject().field("field", "1234").endObject().bytes(),
        XContentType.JSON);
    ParsedDocument doc = mapper.parse(source);

    IndexableField[] fields = doc.rootDoc().getFields("field");
    assertEquals(2, fields.length);
    assertFalse(fields[0].fieldType().omitNorms());
}
/**
 * With language "tr" and primary strength configured in the mapping, an upper-case Turkish value must be
 * indexed under the same key that a Turkish primary-strength collator produces for its lower-case form.
 */
public void testCollator() throws IOException {
    String mapping = XContentFactory.jsonBuilder()
        .startObject()
            .startObject("type")
                .startObject("properties")
                    .startObject("field")
                        .field("type", FIELD_TYPE)
                        .field("language", "tr")
                        .field("strength", "primary")
                    .endObject()
                .endObject()
            .endObject()
        .endObject()
        .string();

    DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping));
    assertEquals(mapping, mapper.mappingSource().toString());

    SourceToParse source = SourceToParse.source("test", "type", "1",
        XContentFactory.jsonBuilder().startObject().field("field", "I WİLL USE TURKİSH CASING").endObject().bytes(),
        XContentType.JSON);
    ParsedDocument doc = mapper.parse(source);

    Collator turkishCollator = Collator.getInstance(new ULocale("tr"));
    turkishCollator.setStrength(Collator.PRIMARY);
    // should collate to same value
    RawCollationKey lowerCaseKey = turkishCollator.getRawCollationKey("ı will use turkish casıng", null);
    BytesRef expectedKey = new BytesRef(lowerCaseKey.bytes, 0, lowerCaseKey.size);

    IndexableField[] fields = doc.rootDoc().getFields("field");
    assertEquals(2, fields.length);

    // first field: the indexed collation key
    assertEquals(expectedKey, fields[0].binaryValue());
    IndexableFieldType indexedType = fields[0].fieldType();
    assertThat(indexedType.omitNorms(), equalTo(true));
    assertFalse(indexedType.tokenized());
    assertFalse(indexedType.stored());
    assertThat(indexedType.indexOptions(), equalTo(IndexOptions.DOCS));
    assertThat(indexedType.storeTermVectors(), equalTo(false));
    assertThat(indexedType.storeTermVectorOffsets(), equalTo(false));
    assertThat(indexedType.storeTermVectorPositions(), equalTo(false));
    assertThat(indexedType.storeTermVectorPayloads(), equalTo(false));
    assertEquals(DocValuesType.NONE, indexedType.docValuesType());

    // second field: the doc values entry for the same key
    assertEquals(expectedKey, fields[1].binaryValue());
    IndexableFieldType docValuesFieldType = fields[1].fieldType();
    assertThat(docValuesFieldType.indexOptions(), equalTo(IndexOptions.NONE));
    assertEquals(DocValuesType.SORTED, docValuesFieldType.docValuesType());
}
/**
 * Registers a mapping with language {@code tr} and strength {@code primary}, then
 * attempts to merge a mapping for the same field with language {@code en} and no
 * strength. The merge is expected to be rejected with an
 * {@link IllegalArgumentException} that reports both the language and the strength
 * conflict.
 */
public void testUpdateCollator() throws IOException {
    String initialMapping = XContentFactory.jsonBuilder().startObject().startObject("type")
        .startObject("properties").startObject("field")
        .field("type", FIELD_TYPE)
        .field("language", "tr")
        .field("strength", "primary")
        .endObject().endObject().endObject().endObject().string();
    indexService.mapperService().merge("type", new CompressedXContent(initialMapping), MergeReason.MAPPING_UPDATE,
        randomBoolean());

    String conflictingMapping = XContentFactory.jsonBuilder().startObject().startObject("type")
        .startObject("properties").startObject("field")
        .field("type", FIELD_TYPE)
        .field("language", "en")
        .endObject().endObject().endObject().endObject().string();

    IllegalArgumentException failure = expectThrows(IllegalArgumentException.class,
        () -> indexService.mapperService().merge("type",
            new CompressedXContent(conflictingMapping), MergeReason.MAPPING_UPDATE, randomBoolean()));

    String expectedMessage = "Can't merge because of conflicts: [Cannot update language setting for [" + FIELD_TYPE
        + "], Cannot update strength setting for [" + FIELD_TYPE + "]]";
    assertEquals(expectedMessage, failure.getMessage());
}
}

View File

@ -175,13 +175,15 @@ public abstract class FieldTypeTestCase extends ESTestCase {
// TODO: remove this once toString is no longer final on FieldType...
protected void assertFieldTypeEquals(String property, MappedFieldType ft1, MappedFieldType ft2) { protected void assertFieldTypeEquals(String property, MappedFieldType ft1, MappedFieldType ft2) {
if (ft1.equals(ft2) == false) { if (ft1.equals(ft2) == false) {
fail("Expected equality, testing property " + property + "\nexpected: " + toString(ft1) + "; \nactual: " + toString(ft2) + "\n"); fail("Expected equality, testing property " + property + "\nexpected: " + toString(ft1) + "; \nactual: " + toString(ft2)
+ "\n");
} }
} }
protected void assertFieldTypeNotEquals(String property, MappedFieldType ft1, MappedFieldType ft2) { protected void assertFieldTypeNotEquals(String property, MappedFieldType ft1, MappedFieldType ft2) {
if (ft1.equals(ft2)) { if (ft1.equals(ft2)) {
fail("Expected inequality, testing property " + property + "\nfirst: " + toString(ft1) + "; \nsecond: " + toString(ft2) + "\n"); fail("Expected inequality, testing property " + property + "\nfirst: " + toString(ft1) + "; \nsecond: " + toString(ft2)
+ "\n");
} }
} }