Wildcard field - add normalizer support (#53851) (#54109)

Backport support for normalisation to wildcard field

Closes #53603
This commit is contained in:
markharwood 2020-03-24 17:37:47 +00:00 committed by GitHub
parent c141c1dd89
commit 6a60f85bba
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 204 additions and 43 deletions

View File

@ -50,6 +50,23 @@ POST my_index/_doc/_search
--------------------------------------------------
[[wildcard-params]]
==== Parameters for wildcard fields
The following parameters are accepted by `wildcard` fields:
[horizontal]
<<ignore-above,`ignore_above`>>::
Do not index any string longer than this value. Defaults to `2147483647`,
so that all values are accepted.
<<normalizer,`normalizer`>>::
How to pre-process the value prior to indexing. Defaults to `null`,
meaning the value is kept as-is.
==== Limitations
* `wildcard` fields are untokenized, like keyword fields, so they do not support queries that rely on word positions, such as phrase queries.

View File

@ -358,14 +358,14 @@ public abstract class MappedFieldType extends FieldType {
}
public Query prefixQuery(String value, @Nullable MultiTermQuery.RewriteMethod method, QueryShardContext context) {
throw new QueryShardException(context, "Can only use prefix queries on keyword and text fields - not on [" + name
throw new QueryShardException(context, "Can only use prefix queries on keyword, text and wildcard fields - not on [" + name
+ "] which is of type [" + typeName() + "]");
}
public Query wildcardQuery(String value,
@Nullable MultiTermQuery.RewriteMethod method,
QueryShardContext context) {
throw new QueryShardException(context, "Can only use wildcard queries on keyword and text fields - not on [" + name
throw new QueryShardException(context, "Can only use wildcard queries on keyword, text and wildcard fields - not on [" + name
+ "] which is of type [" + typeName() + "]");
}

View File

@ -19,6 +19,7 @@
package org.elasticsearch.index.mapper;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.MultiTermQuery;
@ -93,6 +94,36 @@ public abstract class StringFieldType extends TermBasedFieldType {
return query;
}
/**
 * Applies a normalizer to the literal parts of a wildcard pattern while leaving the
 * wildcard syntax itself untouched, e.g. {@code F?o Ba*} becomes {@code f?o ba*} with a
 * lowercasing normalizer.
 *
 * @param fieldname  field name handed to {@link Analyzer#normalize(String, String)}
 * @param value      the raw wildcard pattern
 * @param normalizer the normalizer to apply; when {@code null} the pattern is returned unchanged
 * @return the pattern with every span between {@code WILDCARD_PATTERN} matches normalized
 */
public static final String normalizeWildcardPattern(String fieldname, String value, Analyzer normalizer) {
    if (normalizer == null) {
        return value;
    }
    // Only the text between wildcard matches goes through the normalizer; the matches are
    // copied verbatim so that e.g. a char_filter cannot strip them out of the pattern.
    final Matcher matcher = WILDCARD_PATTERN.matcher(value);
    final BytesRefBuilder pattern = new BytesRefBuilder();
    int cursor = 0;
    while (matcher.find()) {
        final int matchStart = matcher.start();
        // NOTE(review): this guard only skips the chunk in front of a match at offset 0. If
        // WILDCARD_PATTERN can yield adjacent matches, an empty chunk is still passed to the
        // normalizer for the later ones - confirm that is intended.
        if (matchStart > 0) {
            pattern.append(normalizer.normalize(fieldname, value.substring(cursor, matchStart)));
        }
        // the matched wildcard text is appended as-is
        pattern.append(new BytesRef(matcher.group()));
        cursor = matcher.end();
    }
    // trailing literal text after the last wildcard match (or the whole value if none matched)
    if (cursor < value.length()) {
        pattern.append(normalizer.normalize(fieldname, value.substring(cursor)));
    }
    return pattern.toBytesRef().utf8ToString();
}
@Override
public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
failIfNotIndexed();
@ -103,30 +134,8 @@ public abstract class StringFieldType extends TermBasedFieldType {
Term term;
if (searchAnalyzer() != null) {
// we want to normalize everything except wildcard characters, e.g. F?o Ba* to f?o ba*, even if e.g there
// is a char_filter that would otherwise remove them
Matcher wildcardMatcher = WILDCARD_PATTERN.matcher(value);
BytesRefBuilder sb = new BytesRefBuilder();
int last = 0;
while (wildcardMatcher.find()) {
if (wildcardMatcher.start() > 0) {
String chunk = value.substring(last, wildcardMatcher.start());
BytesRef normalized = searchAnalyzer().normalize(name(), chunk);
sb.append(normalized);
}
// append the matched group - without normalizing
sb.append(new BytesRef(wildcardMatcher.group()));
last = wildcardMatcher.end();
}
if (last < value.length()) {
String chunk = value.substring(last);
BytesRef normalized = searchAnalyzer().normalize(name(), chunk);
sb.append(normalized);
}
term = new Term(name(), sb.toBytesRef());
value = normalizeWildcardPattern(name(), value, searchAnalyzer());
term = new Term(name(), value);
} else {
term = new Term(name(), indexedValueForSearch(value));
}

View File

@ -264,7 +264,7 @@ public final class QueryBuilders {
* which matches any single character. Note this query can be slow, as it
* needs to iterate over many terms. In order to prevent extremely slow WildcardQueries,
* a Wildcard term should not start with one of the wildcards {@code *} or
* {@code ?}.
* {@code ?}. (The wildcard field type, however, is optimised for leading wildcards.)
*
* @param name The field name
* @param query The wildcard query string

View File

@ -116,7 +116,7 @@ public class PrefixQueryBuilderTests extends AbstractQueryTestCase<PrefixQueryBu
QueryShardContext context = createShardContext();
QueryShardException e = expectThrows(QueryShardException.class,
() -> query.toQuery(context));
assertEquals("Can only use prefix queries on keyword and text fields - not on [mapped_int] which is of type [integer]",
assertEquals("Can only use prefix queries on keyword, text and wildcard fields - not on [mapped_int] which is of type [integer]",
e.getMessage());
}

View File

@ -816,7 +816,7 @@ public class QueryStringQueryBuilderTests extends AbstractQueryTestCase<QueryStr
QueryShardContext context = createShardContext();
QueryShardException e = expectThrows(QueryShardException.class,
() -> query.toQuery(context));
assertEquals("Can only use prefix queries on keyword and text fields - not on [mapped_int] which is of type [integer]",
assertEquals("Can only use prefix queries on keyword, text and wildcard fields - not on [mapped_int] which is of type [integer]",
e.getMessage());
query.lenient(true);
query.toQuery(context); // no exception

View File

@ -10,10 +10,20 @@ setup:
body:
settings:
number_of_replicas: 0
analysis:
normalizer:
lowercase:
type: custom
char_filter: []
filter: ["lowercase"]
mappings:
properties:
my_wildcard:
type: wildcard
normalizer: lowercase
fields:
case_sensitive:
type: wildcard
- do:
index:
index: test-index
@ -26,6 +36,12 @@ setup:
id: 2
body:
my_wildcard: goodbye world
- do:
index:
index: test-index
id: 3
body:
my_wildcard: cAsE iNsEnSiTiVe World
- do:
indices.refresh: {}
@ -80,6 +96,31 @@ setup:
my_wildcard: {value: "*ello worl*" }
- match: {hits.total.value: 1}
---
"Case insensitive query":
- do:
search:
body:
track_total_hits: true
query:
wildcard:
my_wildcard: {value: "*Worl*" }
- match: {hits.total.value: 3}
---
"Case sensitive query":
- do:
search:
body:
track_total_hits: true
query:
wildcard:
my_wildcard.case_sensitive: {value: "*Worl*" }
- match: {hits.total.value: 1}
---
@ -93,7 +134,7 @@ setup:
my_wildcard: {value: "*ld" }
- match: {hits.total.value: 2}
- match: {hits.total.value: 3}
---
"Long suffix query":
@ -188,8 +229,8 @@ setup:
terms: {field: "my_wildcard" }
- match: {hits.total.value: 2}
- length: { aggregations.top_vals.buckets: 2 }
- match: {hits.total.value: 3}
- length: { aggregations.top_vals.buckets: 3 }
---
"Sort works":
@ -199,10 +240,11 @@ setup:
track_total_hits: true
sort: [ { "my_wildcard": "desc" } ]
- match: { hits.total.value: 2 }
- length: { hits.hits: 2 }
- match: { hits.total.value: 3 }
- length: { hits.hits: 3 }
- match: { hits.hits.0._id: "1" }
- match: { hits.hits.1._id: "2" }
- match: { hits.hits.2._id: "3" }
- do:
search:
@ -210,9 +252,9 @@ setup:
track_total_hits: true
sort: [ { "my_wildcard": "asc" } ]
- match: { hits.total.value: 2 }
- length: { hits.hits: 2 }
- match: { hits.hits.0._id: "2" }
- match: { hits.hits.1._id: "1" }
- match: { hits.total.value: 3 }
- length: { hits.hits: 3 }
- match: { hits.hits.0._id: "3" }
- match: { hits.hits.1._id: "2" }
- match: { hits.hits.2._id: "1" }

View File

@ -39,6 +39,7 @@ import org.elasticsearch.common.xcontent.support.XContentMapValues;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AnalyzerScope;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.IndexFieldData.XFieldComparatorSource.Nested;
@ -53,6 +54,7 @@ import org.elasticsearch.index.mapper.MapperParsingException;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.index.mapper.ParseContext;
import org.elasticsearch.index.mapper.ParseContext.Document;
import org.elasticsearch.index.mapper.StringFieldType;
import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.index.similarity.SimilarityProvider;
import org.elasticsearch.indices.breaker.CircuitBreakerService;
@ -64,6 +66,7 @@ import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import static org.elasticsearch.index.mapper.TypeParsers.parseField;
@ -100,6 +103,9 @@ public class WildcardFieldMapper extends FieldMapper {
public static class Builder extends FieldMapper.Builder<Builder, WildcardFieldMapper> {
protected int ignoreAbove = Defaults.IGNORE_ABOVE;
private IndexAnalyzers indexAnalyzers;
private String normalizerName;
public Builder(String name) {
super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE);
@ -164,10 +170,23 @@ public class WildcardFieldMapper extends FieldMapper {
public WildcardFieldType fieldType() {
return (WildcardFieldType) super.fieldType();
}
/**
 * Records which normalizer this field should use. Only the name and the registry are
 * stored here; the lookup itself is performed in {@code build}.
 *
 * @param indexAnalyzers registry the normalizer is later resolved from
 * @param name           name of the normalizer
 * @return this builder
 */
public Builder normalizer(IndexAnalyzers indexAnalyzers, String name) {
    this.normalizerName = name;
    this.indexAnalyzers = indexAnalyzers;
    return builder;
}
@Override
public WildcardFieldMapper build(BuilderContext context) {
setupFieldType(context);
setupFieldType(context);
if (normalizerName != null) {
NamedAnalyzer normalizer = indexAnalyzers.getNormalizer(normalizerName);
if (normalizer == null) {
throw new MapperParsingException("normalizer [" + normalizerName + "] not found for field [" + name + "]");
}
fieldType().setNormalizer(normalizer);
}
return new WildcardFieldMapper(
name, fieldType, defaultFieldType, ignoreAbove,
context.indexSettings(), multiFieldsBuilder.build(this, context), copyTo);
@ -188,6 +207,11 @@ public class WildcardFieldMapper extends FieldMapper {
if (propName.equals("ignore_above")) {
builder.ignoreAbove(XContentMapValues.nodeIntegerValue(propNode, -1));
iterator.remove();
} else if (propName.equals("normalizer")) {
if (propNode != null) {
builder.normalizer(parserContext.getIndexAnalyzers(), propNode.toString());
}
iterator.remove();
}
}
@ -198,6 +222,8 @@ public class WildcardFieldMapper extends FieldMapper {
public static final char TOKEN_START_OR_END_CHAR = 0;
public static final class WildcardFieldType extends MappedFieldType {
private NamedAnalyzer normalizer = null;
public WildcardFieldType() {
setIndexAnalyzer(Lucene.KEYWORD_ANALYZER);
@ -206,6 +232,7 @@ public class WildcardFieldMapper extends FieldMapper {
protected WildcardFieldType(WildcardFieldType ref) {
super(ref);
this.normalizer = ref.normalizer;
}
public WildcardFieldType clone() {
@ -213,7 +240,39 @@ public class WildcardFieldMapper extends FieldMapper {
return result;
}
/**
 * Two wildcard field types are equal when the superclass considers them equal and they
 * use equal normalizers.
 */
@Override
public boolean equals(Object o) {
    // The cast is only reached once super.equals(o) has returned true.
    // NOTE(review): assumes super.equals rejects objects of a different class - confirm.
    return super.equals(o) && Objects.equals(normalizer, ((WildcardFieldType) o).normalizer);
}
/**
 * Combines the superclass hash with the hash of the normalizer, matching the fields
 * compared in {@link #equals(Object)}.
 */
@Override
public int hashCode() {
    final int inherited = super.hashCode();
    final int normalizerHash = Objects.hash(normalizer);
    return 31 * inherited + normalizerHash;
}
/**
 * @return the normalizer configured for this field type, or {@code null} if none was set
 */
private NamedAnalyzer normalizer() {
    return this.normalizer;
}
/**
 * Sets the normalizer for this field type.
 *
 * @param normalizer the normalizer to use; {@code null} means no normalization
 */
public void setNormalizer(NamedAnalyzer normalizer) {
    // presumably rejects modification once the field type is frozen - see checkIfFrozen()
    checkIfFrozen();
    this.normalizer = normalizer;
}
/**
 * Runs the superclass compatibility checks and additionally reports a conflict when the
 * other field type uses a different normalizer.
 *
 * @param otherFT   the field type to compare against
 * @param conflicts list that conflict descriptions are appended to
 */
@Override
public void checkCompatibility(MappedFieldType otherFT, List<String> conflicts) {
    super.checkCompatibility(otherFT, conflicts);
    // NOTE(review): unchecked cast - assumes the super call has already ruled out a
    // different field type class; confirm.
    final NamedAnalyzer otherNormalizer = ((WildcardFieldType) otherFT).normalizer;
    if (Objects.equals(normalizer, otherNormalizer)) {
        return;
    }
    conflicts.add("mapper [" + name() + "] has different [normalizer]");
}
// Holds parsed information about the wildcard pattern
static class PatternStructure {
boolean openStart, openEnd, hasSymbols;
@ -327,6 +386,9 @@ public class WildcardFieldMapper extends FieldMapper {
@Override
public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QueryShardContext context) {
if (normalizer != null) {
wildcardPattern = StringFieldType.normalizeWildcardPattern(name(), wildcardPattern, normalizer);
}
PatternStructure patternStructure = new PatternStructure(wildcardPattern);
ArrayList<String> tokens = new ArrayList<>();
@ -467,7 +529,32 @@ public class WildcardFieldMapper extends FieldMapper {
CircuitBreakerService breakerService, MapperService mapperService) {
return new WildcardBytesBinaryDVIndexFieldData(indexSettings.getIndex(), fieldType.name());
}};
}
}
/**
 * Runs a value through the configured normalizer.
 *
 * @param value the raw field value
 * @return the single token emitted by the normalizer, or {@code value} unchanged when no
 *         normalizer is configured
 * @throws IOException           declared because the token stream is consumed and closed here
 * @throws IllegalStateException if the normalizer emits zero tokens or more than one token
 */
String normalize(String value) throws IOException {
    if (normalizer == null) {
        return value;
    }
    try (TokenStream stream = normalizer.tokenStream(name(), value)) {
        final CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        final boolean producedToken = stream.incrementToken();
        if (producedToken == false) {
            throw new IllegalStateException("The normalization token stream is "
                + "expected to produce exactly 1 token, but got 0 for analyzer "
                + normalizer + " and input \"" + value + "\"");
        }
        // capture the term before advancing the stream again
        final String normalized = term.toString();
        final boolean producedSecondToken = stream.incrementToken();
        if (producedSecondToken) {
            throw new IllegalStateException("The normalization token stream is "
                + "expected to produce exactly 1 token, but got 2+ for analyzer "
                + normalizer + " and input \"" + value + "\"");
        }
        stream.end();
        return normalized;
    }
}
}
static class WildcardBytesBinaryDVIndexFieldData extends BytesBinaryDVIndexFieldData{
@ -521,6 +608,11 @@ public class WildcardFieldMapper extends FieldMapper {
if (includeDefaults || ignoreAbove != Defaults.IGNORE_ABOVE) {
builder.field("ignore_above", ignoreAbove);
}
if (fieldType().normalizer() != null) {
builder.field("normalizer", fieldType().normalizer().name());
} else if (includeDefaults) {
builder.nullField("normalizer");
}
}
@Override
@ -544,10 +636,11 @@ public class WildcardFieldMapper extends FieldMapper {
// For internal use by Lucene only - used to define ngram index
final MappedFieldType ngramFieldType;
void createFields(String value, Document parseDoc, List<IndexableField>fields) {
void createFields(String value, Document parseDoc, List<IndexableField>fields) throws IOException {
if (value == null || value.length() > ignoreAbove) {
return;
}
value = fieldType().normalize(value);
String ngramValue = TOKEN_START_OR_END_CHAR + value + TOKEN_START_OR_END_CHAR + TOKEN_START_OR_END_CHAR;
Field ngramField = new Field(fieldType().name(), ngramValue, ngramFieldType);
fields.add(ngramField);