Backport support for normalisation to wildcard field. Closes #53603

parent c141c1dd89
commit 6a60f85bba
@@ -50,6 +50,23 @@ POST my_index/_doc/_search
 --------------------------------------------------

+[[wildcard-params]]
+==== Parameters for wildcard fields
+
+The following parameters are accepted by `wildcard` fields:
+
+[horizontal]
+
+<<ignore-above,`ignore_above`>>::
+
+    Do not index any string longer than this value. Defaults to `2147483647`
+    so that all values are accepted.
+
+<<normalizer,`normalizer`>>::
+
+    How to pre-process the value prior to indexing. Defaults to `null`,
+    meaning the value is kept as-is.
+
+==== Limitations
+
+* `wildcard` fields are untokenized, like keyword fields, so they do not support queries that rely on word positions, such as phrase queries.
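For illustration (not part of this commit's diff), a minimal sketch of a mapping that exercises the new `normalizer` parameter, mirroring the lowercase normalizer defined in the REST test further down; the index name is made up:

--------------------------------------------------
PUT my_index
{
  "settings": {
    "analysis": {
      "normalizer": {
        "lowercase": {
          "type": "custom",
          "char_filter": [],
          "filter": ["lowercase"]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "my_wildcard": {
        "type": "wildcard",
        "normalizer": "lowercase"
      }
    }
  }
}
--------------------------------------------------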
@@ -358,14 +358,14 @@ public abstract class MappedFieldType extends FieldType {
    }

    public Query prefixQuery(String value, @Nullable MultiTermQuery.RewriteMethod method, QueryShardContext context) {
-        throw new QueryShardException(context, "Can only use prefix queries on keyword and text fields - not on [" + name
+        throw new QueryShardException(context, "Can only use prefix queries on keyword, text and wildcard fields - not on [" + name
            + "] which is of type [" + typeName() + "]");
    }

    public Query wildcardQuery(String value,
                               @Nullable MultiTermQuery.RewriteMethod method,
                               QueryShardContext context) {
-        throw new QueryShardException(context, "Can only use wildcard queries on keyword and text fields - not on [" + name
+        throw new QueryShardException(context, "Can only use wildcard queries on keyword, text and wildcard fields - not on [" + name
            + "] which is of type [" + typeName() + "]");
    }
@@ -19,6 +19,7 @@
package org.elasticsearch.index.mapper;

+import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.MultiTermQuery;
@@ -93,6 +94,36 @@ public abstract class StringFieldType extends TermBasedFieldType {
        return query;
    }

+    public static final String normalizeWildcardPattern(String fieldname, String value, Analyzer normalizer) {
+        if (normalizer == null) {
+            return value;
+        }
+        // we want to normalize everything except wildcard characters, e.g. F?o Ba* to f?o ba*, even if e.g. there
+        // is a char_filter that would otherwise remove them
+        Matcher wildcardMatcher = WILDCARD_PATTERN.matcher(value);
+        BytesRefBuilder sb = new BytesRefBuilder();
+        int last = 0;
+
+        while (wildcardMatcher.find()) {
+            if (wildcardMatcher.start() > 0) {
+                String chunk = value.substring(last, wildcardMatcher.start());
+
+                BytesRef normalized = normalizer.normalize(fieldname, chunk);
+                sb.append(normalized);
+            }
+            // append the matched group - without normalizing
+            sb.append(new BytesRef(wildcardMatcher.group()));
+
+            last = wildcardMatcher.end();
+        }
+        if (last < value.length()) {
+            String chunk = value.substring(last);
+            BytesRef normalized = normalizer.normalize(fieldname, chunk);
+            sb.append(normalized);
+        }
+        return sb.toBytesRef().utf8ToString();
+    }
+
    @Override
    public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
        failIfNotIndexed();
@@ -103,30 +134,8 @@ public abstract class StringFieldType extends TermBasedFieldType {

        Term term;
        if (searchAnalyzer() != null) {
-            // we want to normalize everything except wildcard characters, e.g. F?o Ba* to f?o ba*, even if e.g there
-            // is a char_filter that would otherwise remove them
-            Matcher wildcardMatcher = WILDCARD_PATTERN.matcher(value);
-            BytesRefBuilder sb = new BytesRefBuilder();
-            int last = 0;
-
-            while (wildcardMatcher.find()) {
-                if (wildcardMatcher.start() > 0) {
-                    String chunk = value.substring(last, wildcardMatcher.start());
-
-                    BytesRef normalized = searchAnalyzer().normalize(name(), chunk);
-                    sb.append(normalized);
-                }
-                // append the matched group - without normalizing
-                sb.append(new BytesRef(wildcardMatcher.group()));
-
-                last = wildcardMatcher.end();
-            }
-            if (last < value.length()) {
-                String chunk = value.substring(last);
-                BytesRef normalized = searchAnalyzer().normalize(name(), chunk);
-                sb.append(normalized);
-            }
-            term = new Term(name(), sb.toBytesRef());
+            value = normalizeWildcardPattern(name(), value, searchAnalyzer());
+            term = new Term(name(), value);
        } else {
            term = new Term(name(), indexedValueForSearch(value));
        }
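For illustration (not part of this diff), a minimal sketch of what the extracted `normalizeWildcardPattern` helper does. The anonymous keyword-style analyzer that lowercases during normalization is made up for the demo; any analyzer whose `normalize` chain lowercases would behave the same:

```java
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.elasticsearch.index.mapper.StringFieldType;

public class NormalizeWildcardPatternDemo {
    public static void main(String[] args) {
        // Keyword-style analyzer that lowercases in its normalization chain.
        Analyzer lowercase = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new KeywordTokenizer();
                return new TokenStreamComponents(tokenizer, new LowerCaseFilter(tokenizer));
            }

            @Override
            protected TokenStream normalize(String fieldName, TokenStream in) {
                return new LowerCaseFilter(in);
            }
        };
        // Text chunks are normalized while the wildcard characters pass
        // through untouched: prints "f?o ba*".
        System.out.println(StringFieldType.normalizeWildcardPattern("field", "F?o Ba*", lowercase));
    }
}
```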
@@ -264,7 +264,7 @@ public final class QueryBuilders {
     * which matches any single character. Note this query can be slow, as it
     * needs to iterate over many terms. In order to prevent extremely slow WildcardQueries,
     * a Wildcard term should not start with one of the wildcards {@code *} or
-     * {@code ?}.
+     * {@code ?}. (The wildcard field type, however, is optimised for leading wildcards.)
     *
     * @param name The field name
     * @param query The wildcard query string
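For illustration (not part of this diff), a hedged usage sketch of the builder this Javadoc documents; the wrapper class and field name are made up:

```java
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;

class WildcardQueryExample {
    // On keyword/text fields a leading wildcard is the slow case; on a
    // `wildcard`-typed field it is the optimised one.
    static QueryBuilder leadingWildcard() {
        return QueryBuilders.wildcardQuery("my_wildcard", "*error*");
    }
}
```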
@@ -116,7 +116,7 @@ public class PrefixQueryBuilderTests extends AbstractQueryTestCase<PrefixQueryBu
        QueryShardContext context = createShardContext();
        QueryShardException e = expectThrows(QueryShardException.class,
                () -> query.toQuery(context));
-        assertEquals("Can only use prefix queries on keyword and text fields - not on [mapped_int] which is of type [integer]",
+        assertEquals("Can only use prefix queries on keyword, text and wildcard fields - not on [mapped_int] which is of type [integer]",
                e.getMessage());
    }
@@ -816,7 +816,7 @@ public class QueryStringQueryBuilderTests extends AbstractQueryTestCase<QueryStr
        QueryShardContext context = createShardContext();
        QueryShardException e = expectThrows(QueryShardException.class,
                () -> query.toQuery(context));
-        assertEquals("Can only use prefix queries on keyword and text fields - not on [mapped_int] which is of type [integer]",
+        assertEquals("Can only use prefix queries on keyword, text and wildcard fields - not on [mapped_int] which is of type [integer]",
                e.getMessage());
        query.lenient(true);
        query.toQuery(context); // no exception
@@ -10,10 +10,20 @@ setup:
        body:
          settings:
            number_of_replicas: 0
+            analysis:
+              normalizer:
+                lowercase:
+                  type: custom
+                  char_filter: []
+                  filter: ["lowercase"]
          mappings:
            properties:
              my_wildcard:
                type: wildcard
+                normalizer: lowercase
+                fields:
+                  case_sensitive:
+                    type: wildcard
  - do:
      index:
        index: test-index
@@ -26,6 +36,12 @@ setup:
        id: 2
        body:
          my_wildcard: goodbye world
+  - do:
+      index:
+        index: test-index
+        id: 3
+        body:
+          my_wildcard: cAsE iNsEnSiTiVe World

  - do:
      indices.refresh: {}
@@ -80,6 +96,31 @@ setup:
            my_wildcard: {value: "*ello worl*" }

  - match: {hits.total.value: 1}
---
+"Case insensitive query":
+  - do:
+      search:
+        body:
+          track_total_hits: true
+          query:
+            wildcard:
+              my_wildcard: {value: "*Worl*" }
+
+  - match: {hits.total.value: 3}
+
+---
+"Case sensitive query":
+  - do:
+      search:
+        body:
+          track_total_hits: true
+          query:
+            wildcard:
+              my_wildcard.case_sensitive: {value: "*Worl*" }
+
+  - match: {hits.total.value: 1}
+
+---
@@ -93,7 +134,7 @@ setup:
            my_wildcard: {value: "*ld" }

-  - match: {hits.total.value: 2}
+  - match: {hits.total.value: 3}

---
"Long suffix query":
@@ -188,8 +229,8 @@ setup:
          terms: {field: "my_wildcard" }

-  - match: {hits.total.value: 2}
-  - length: { aggregations.top_vals.buckets: 2 }
+  - match: {hits.total.value: 3}
+  - length: { aggregations.top_vals.buckets: 3 }

---
"Sort works":
@@ -199,10 +240,11 @@ setup:
          track_total_hits: true
          sort: [ { "my_wildcard": "desc" } ]

-  - match: { hits.total.value: 2 }
-  - length: { hits.hits: 2 }
+  - match: { hits.total.value: 3 }
+  - length: { hits.hits: 3 }
  - match: { hits.hits.0._id: "1" }
  - match: { hits.hits.1._id: "2" }
+  - match: { hits.hits.2._id: "3" }

  - do:
      search:
@@ -210,9 +252,9 @@ setup:
          track_total_hits: true
          sort: [ { "my_wildcard": "asc" } ]

-  - match: { hits.total.value: 2 }
-  - length: { hits.hits: 2 }
-  - match: { hits.hits.0._id: "2" }
-  - match: { hits.hits.1._id: "1" }
+  - match: { hits.total.value: 3 }
+  - length: { hits.hits: 3 }
+  - match: { hits.hits.0._id: "3" }
+  - match: { hits.hits.1._id: "2" }
+  - match: { hits.hits.2._id: "1" }
@@ -39,6 +39,7 @@ import org.elasticsearch.common.xcontent.support.XContentMapValues;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AnalyzerScope;
+import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.IndexFieldData.XFieldComparatorSource.Nested;
@@ -53,6 +54,7 @@ import org.elasticsearch.index.mapper.MapperParsingException;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.index.mapper.ParseContext;
import org.elasticsearch.index.mapper.ParseContext.Document;
+import org.elasticsearch.index.mapper.StringFieldType;
import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.index.similarity.SimilarityProvider;
import org.elasticsearch.indices.breaker.CircuitBreakerService;
@@ -64,6 +66,7 @@ import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
+import java.util.Objects;

import static org.elasticsearch.index.mapper.TypeParsers.parseField;
@@ -100,6 +103,9 @@ public class WildcardFieldMapper extends FieldMapper {

    public static class Builder extends FieldMapper.Builder<Builder, WildcardFieldMapper> {
        protected int ignoreAbove = Defaults.IGNORE_ABOVE;
+        private IndexAnalyzers indexAnalyzers;
+        private String normalizerName;

        public Builder(String name) {
            super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE);
@@ -164,10 +170,23 @@ public class WildcardFieldMapper extends FieldMapper {
        public WildcardFieldType fieldType() {
            return (WildcardFieldType) super.fieldType();
        }

+        public Builder normalizer(IndexAnalyzers indexAnalyzers, String name) {
+            this.indexAnalyzers = indexAnalyzers;
+            this.normalizerName = name;
+            return builder;
+        }
+
        @Override
        public WildcardFieldMapper build(BuilderContext context) {
            setupFieldType(context);
+            if (normalizerName != null) {
+                NamedAnalyzer normalizer = indexAnalyzers.getNormalizer(normalizerName);
+                if (normalizer == null) {
+                    throw new MapperParsingException("normalizer [" + normalizerName + "] not found for field [" + name + "]");
+                }
+                fieldType().setNormalizer(normalizer);
+            }
            return new WildcardFieldMapper(
                name, fieldType, defaultFieldType, ignoreAbove,
                context.indexSettings(), multiFieldsBuilder.build(this, context), copyTo);
@@ -188,6 +207,11 @@ public class WildcardFieldMapper extends FieldMapper {
                if (propName.equals("ignore_above")) {
                    builder.ignoreAbove(XContentMapValues.nodeIntegerValue(propNode, -1));
                    iterator.remove();
+                } else if (propName.equals("normalizer")) {
+                    if (propNode != null) {
+                        builder.normalizer(parserContext.getIndexAnalyzers(), propNode.toString());
+                    }
+                    iterator.remove();
                }
            }
@@ -198,6 +222,8 @@ public class WildcardFieldMapper extends FieldMapper {
    public static final char TOKEN_START_OR_END_CHAR = 0;

    public static final class WildcardFieldType extends MappedFieldType {

+        private NamedAnalyzer normalizer = null;
+
        public WildcardFieldType() {
            setIndexAnalyzer(Lucene.KEYWORD_ANALYZER);
@@ -206,6 +232,7 @@ public class WildcardFieldMapper extends FieldMapper {

        protected WildcardFieldType(WildcardFieldType ref) {
            super(ref);
+            this.normalizer = ref.normalizer;
        }

        public WildcardFieldType clone() {
@@ -213,7 +240,39 @@ public class WildcardFieldMapper extends FieldMapper {
            return result;
        }

+        @Override
+        public boolean equals(Object o) {
+            if (super.equals(o) == false) {
+                return false;
+            }
+            WildcardFieldType other = (WildcardFieldType) o;
+            return Objects.equals(normalizer, other.normalizer);
+        }
+
+        @Override
+        public int hashCode() {
+            return 31 * super.hashCode() + Objects.hash(normalizer);
+        }
+
+        private NamedAnalyzer normalizer() {
+            return normalizer;
+        }
+
+        public void setNormalizer(NamedAnalyzer normalizer) {
+            checkIfFrozen();
+            this.normalizer = normalizer;
+        }
+
+        @Override
+        public void checkCompatibility(MappedFieldType otherFT, List<String> conflicts) {
+            super.checkCompatibility(otherFT, conflicts);
+            WildcardFieldType other = (WildcardFieldType) otherFT;
+            if (Objects.equals(normalizer, other.normalizer) == false) {
+                conflicts.add("mapper [" + name() + "] has different [normalizer]");
+            }
+        }
+
        // Holds parsed information about the wildcard pattern
        static class PatternStructure {
            boolean openStart, openEnd, hasSymbols;
@@ -327,6 +386,9 @@ public class WildcardFieldMapper extends FieldMapper {

        @Override
        public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QueryShardContext context) {
+            if (normalizer != null) {
+                wildcardPattern = StringFieldType.normalizeWildcardPattern(name(), wildcardPattern, normalizer);
+            }
            PatternStructure patternStructure = new PatternStructure(wildcardPattern);
            ArrayList<String> tokens = new ArrayList<>();
@@ -467,7 +529,32 @@ public class WildcardFieldMapper extends FieldMapper {
                    CircuitBreakerService breakerService, MapperService mapperService) {
                    return new WildcardBytesBinaryDVIndexFieldData(indexSettings.getIndex(), fieldType.name());
                }};
        }

+        String normalize(String value) throws IOException {
+            if (normalizer != null) {
+                try (TokenStream ts = normalizer.tokenStream(name(), value)) {
+                    final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
+                    ts.reset();
+                    if (ts.incrementToken() == false) {
+                        throw new IllegalStateException("The normalization token stream is "
+                            + "expected to produce exactly 1 token, but got 0 for analyzer "
+                            + normalizer + " and input \"" + value + "\"");
+                    }
+                    final String newValue = termAtt.toString();
+                    if (ts.incrementToken()) {
+                        throw new IllegalStateException("The normalization token stream is "
+                            + "expected to produce exactly 1 token, but got 2+ for analyzer "
+                            + normalizer + " and input \"" + value + "\"");
+                    }
+                    ts.end();
+                    return newValue;
+                }
+            }
+            return value;
+        }
    }

    static class WildcardBytesBinaryDVIndexFieldData extends BytesBinaryDVIndexFieldData{
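For illustration (not part of this diff), the "exactly one token" contract that `normalize` enforces, sketched with stock Lucene analyzers; the demo class and its helper are made up:

```java
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;

public class OneTokenContractDemo {

    // Counts the tokens an analyzer produces for a value, mirroring the
    // incrementToken() checks in WildcardFieldType#normalize above.
    static int countTokens(Analyzer analyzer, String field, String value) throws Exception {
        int count = 0;
        try (TokenStream ts = analyzer.tokenStream(field, value)) {
            ts.reset();
            while (ts.incrementToken()) {
                count++;
            }
            ts.end();
        }
        return count;
    }

    public static void main(String[] args) throws Exception {
        // A keyword-style analyzer emits the whole value as one token,
        // so it is a valid normalizer.
        System.out.println(countTokens(new KeywordAnalyzer(), "f", "Hello World"));    // 1
        // A whitespace analyzer emits two tokens here, which normalize()
        // would reject with the IllegalStateException shown above.
        System.out.println(countTokens(new WhitespaceAnalyzer(), "f", "Hello World")); // 2
    }
}
```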
@@ -521,6 +608,11 @@ public class WildcardFieldMapper extends FieldMapper {
        if (includeDefaults || ignoreAbove != Defaults.IGNORE_ABOVE) {
            builder.field("ignore_above", ignoreAbove);
        }
+        if (fieldType().normalizer() != null) {
+            builder.field("normalizer", fieldType().normalizer().name());
+        } else if (includeDefaults) {
+            builder.nullField("normalizer");
+        }
    }

    @Override
@@ -544,10 +636,11 @@ public class WildcardFieldMapper extends FieldMapper {
    // For internal use by Lucene only - used to define ngram index
    final MappedFieldType ngramFieldType;

-    void createFields(String value, Document parseDoc, List<IndexableField>fields) {
+    void createFields(String value, Document parseDoc, List<IndexableField>fields) throws IOException {
        if (value == null || value.length() > ignoreAbove) {
            return;
        }
+        value = fieldType().normalize(value);
        String ngramValue = TOKEN_START_OR_END_CHAR + value + TOKEN_START_OR_END_CHAR + TOKEN_START_OR_END_CHAR;
        Field ngramField = new Field(fieldType().name(), ngramValue, ngramFieldType);
        fields.add(ngramField);
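For illustration (not part of this diff), a sketch of the marker wrapping above: values are normalized first, then wrapped in 0-chars, which appears to anchor the ngram index at string start and end so patterns like "*ld" can match suffixes. The class here is hypothetical and only mirrors the expression in `createFields`:

```java
class NgramValueSketch {
    static final char TOKEN_START_OR_END_CHAR = 0;

    // One marker before the value and two after, e.g. a normalized "hello"
    // becomes "\u0000hello\u0000\u0000" before ngram indexing.
    static String ngramValue(String normalizedValue) {
        return TOKEN_START_OR_END_CHAR + normalizedValue + TOKEN_START_OR_END_CHAR + TOKEN_START_OR_END_CHAR;
    }
}
```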