Backport of new wildcard field type (#53590)

* New wildcard field optimised for wildcard queries (#49993)

Indexes values using size 3 ngrams and also stores the full original value as a binary doc value.
Wildcard queries operate by running a cheap approximation query on the ngram field, followed by a more expensive verification query that runs an automaton over the binary doc values. Also supports aggregations and sorting.
markharwood 2020-03-16 15:07:13 +00:00 committed by GitHub
parent a906f8a0e4
commit 2c74f3e22c
11 changed files with 1365 additions and 17 deletions
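The commit message above summarises the design: a cheap ngram approximation followed by an expensive automaton verification. Below is a minimal, dependency-free Java sketch of that two-phase idea over an in-memory list of values; `TwoPhaseWildcardSketch` and its helper names are hypothetical illustrations, not the Lucene-based implementation added by this commit.

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

public class TwoPhaseWildcardSketch {

    static final int NGRAM_SIZE = 3;

    // Split a wildcard pattern into its literal fragments, e.g. "*rror*404*" -> ["rror", "404"].
    static List<String> literalFragments(String pattern) {
        List<String> fragments = new ArrayList<>();
        for (String fragment : pattern.split("[*?]+")) {
            if (fragment.isEmpty() == false) {
                fragments.add(fragment);
            }
        }
        return fragments;
    }

    // Phase 1: cheap approximation. A candidate must contain every ngram of every
    // literal fragment (the real field runs term queries against an ngram index).
    static boolean approximateMatch(String value, List<String> fragments) {
        for (String fragment : fragments) {
            if (fragment.length() < NGRAM_SIZE) {
                if (value.contains(fragment) == false) {
                    return false;
                }
                continue;
            }
            for (int i = 0; i + NGRAM_SIZE <= fragment.length(); i++) {
                if (value.contains(fragment.substring(i, i + NGRAM_SIZE)) == false) {
                    return false;
                }
            }
        }
        return true;
    }

    // Phase 2: expensive verification of the full value. The real field runs a
    // ByteRunAutomaton over binary doc values; a regex stands in for it here,
    // assuming the pattern contains only letters, digits, '*' and '?'.
    static boolean verify(String value, String pattern) {
        String regex = pattern.replace("*", ".*").replace("?", ".");
        return Pattern.compile(regex, Pattern.DOTALL).matcher(value).matches();
    }

    public static void main(String[] args) {
        List<String> values = List.of("error 404 not found", "error 500", "all good here");
        String pattern = "*rror*404*";
        List<String> fragments = literalFragments(pattern);
        for (String value : values) {
            // Only candidates that survive the cheap filter pay for verification.
            if (approximateMatch(value, fragments) && verify(value, pattern)) {
                System.out.println("match: " + value);
            }
        }
    }
}

Running the sketch prints only "match: error 404 not found": the phase-1 ngram filter discards "error 500" before the costly verification runs.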


@@ -7,7 +7,7 @@ document:
[float]
=== Core datatypes
-string:: <<text,`text`>> and <<keyword,`keyword`>>
+string:: <<text,`text`>>, <<keyword,`keyword`>> and <<wildcard,`wildcard`>>
<<number>>:: `long`, `integer`, `short`, `byte`, `double`, `float`, `half_float`, `scaled_float`
<<date>>:: `date`
<<date_nanos>>:: `date_nanos`
@@ -135,3 +135,5 @@ include::types/token-count.asciidoc[]
include::types/shape.asciidoc[]
include::types/constant-keyword.asciidoc[]
+include::types/wildcard.asciidoc[]


@@ -0,0 +1,55 @@
[role="xpack"]
[testenv="basic"]
[[wildcard]]
=== Wildcard datatype
++++
<titleabbrev>Wildcard</titleabbrev>
++++
A `wildcard` field stores values optimised for wildcard, grep-like queries.
Wildcard queries are possible on other field types but suffer from constraints:

* `text` fields limit matching of wildcard expressions to individual tokens rather than the whole original value held in the field.
* `keyword` fields are untokenized, but slow at performing wildcard queries (especially patterns with leading wildcards).

Internally the `wildcard` field indexes the whole field value using ngrams and stores the full string.
The ngram index is used as a rough filter to cut down the set of candidate values, which are then verified by retrieving and checking the full stored values.
This field is especially well suited to running grep-like queries on log lines. Storage costs are typically lower than those of `keyword`
fields, but search speeds for exact matches on full terms are slower.

You index and search a wildcard field as follows:
[source,console]
--------------------------------------------------
PUT my_index
{
  "mappings": {
    "properties": {
      "my_wildcard": {
        "type": "wildcard"
      }
    }
  }
}

PUT my_index/_doc/1
{
  "my_wildcard" : "This string can be quite lengthy"
}

POST my_index/_search
{
  "query": {
    "wildcard" : {
      "my_wildcard" : {
        "value": "*quite*lengthy"
      }
    }
  }
}
--------------------------------------------------
==== Limitations
* `wildcard` fields are untokenized like `keyword` fields, so they do not support queries that rely on word positions, such as phrase queries.


@@ -21,8 +21,6 @@ package org.elasticsearch.index.fielddata.plain;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.SortField;
-import org.apache.lucene.search.SortedSetSortField;
-import org.apache.lucene.search.SortedSetSelector;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.index.Index;
@@ -54,20 +52,7 @@ public class BinaryDVIndexFieldData extends DocValuesIndexFieldData implements I
public SortField sortField(@Nullable Object missingValue, MultiValueMode sortMode, XFieldComparatorSource.Nested nested,
boolean reverse) {
XFieldComparatorSource source = new BytesRefFieldComparatorSource(this, missingValue, sortMode, nested);
-/**
- * Check if we can use a simple {@link SortedSetSortField} compatible with index sorting and
- * returns a custom sort field otherwise.
- */
-if (nested != null ||
-(sortMode != MultiValueMode.MAX && sortMode != MultiValueMode.MIN) ||
-(source.sortMissingFirst(missingValue) == false && source.sortMissingLast(missingValue) == false)) {
-return new SortField(getFieldName(), source, reverse);
-}
-SortField sortField = new SortedSetSortField(fieldName, reverse,
-sortMode == MultiValueMode.MAX ? SortedSetSelector.Type.MAX : SortedSetSelector.Type.MIN);
-sortField.setMissingValue(source.sortMissingLast(missingValue) ^ reverse ?
-SortedSetSortField.STRING_LAST : SortedSetSortField.STRING_FIRST);
-return sortField;
+return new SortField(getFieldName(), source, reverse);
}
@Override


@@ -613,6 +613,16 @@ public class XPackLicenseState {
public boolean isVectorsAllowed() {
return allowForAllLicenses();
}
+/**
+ * Determine if Wildcard support should be enabled.
+ * <p>
+ * Wildcard is available for all license types except {@link OperationMode#MISSING}
+ */
+public synchronized boolean isWildcardAllowed() {
+return status.active;
+}
public boolean isOdbcAllowed() {
return isAllowedByLicense(OperationMode.PLATINUM);


@@ -0,0 +1,218 @@
setup:
  - skip:
      features: headers
      version: " - 7.6.99"
      reason: "wildcard fields were added from 7.7"
  - do:
      indices.create:
        index: test-index
        body:
          settings:
            number_of_replicas: 0
          mappings:
            properties:
              my_wildcard:
                type: wildcard
  - do:
      index:
        index: test-index
        id: 1
        body:
          my_wildcard: hello world
  - do:
      index:
        index: test-index
        id: 2
        body:
          my_wildcard: goodbye world
  - do:
      indices.refresh: {}
---
"Short prefix query":
  - do:
      search:
        body:
          track_total_hits: true
          query:
            wildcard:
              my_wildcard: {value: "hel*" }
  - match: {hits.total.value: 1}
---
"Long prefix query":
  - do:
      search:
        body:
          track_total_hits: true
          query:
            wildcard:
              my_wildcard: {value: "hello wor*" }
  - match: {hits.total.value: 1}
---
"Short unrooted query":
  - do:
      search:
        body:
          track_total_hits: true
          query:
            wildcard:
              my_wildcard: {value: "*ello*" }
  - match: {hits.total.value: 1}
---
"Long unrooted query":
  - do:
      search:
        body:
          track_total_hits: true
          query:
            wildcard:
              my_wildcard: {value: "*ello worl*" }
  - match: {hits.total.value: 1}
---
"Short suffix query":
  - do:
      search:
        body:
          track_total_hits: true
          query:
            wildcard:
              my_wildcard: {value: "*ld" }
  - match: {hits.total.value: 2}
---
"Long suffix query":
  - do:
      search:
        body:
          track_total_hits: true
          query:
            wildcard:
              my_wildcard: {value: "*ello world" }
  - match: {hits.total.value: 1}
---
"No wildcard wildcard query":
  - do:
      search:
        body:
          track_total_hits: true
          query:
            wildcard:
              my_wildcard: {value: "hello world" }
  - match: {hits.total.value: 1}
---
"Term query on wildcard field":
  - do:
      search:
        body:
          track_total_hits: true
          query:
            term:
              my_wildcard: "hello world"
  - match: {hits.total.value: 1}
---
"Terms query on wildcard field":
  - do:
      search:
        body:
          track_total_hits: true
          query:
            terms:
              my_wildcard: ["hello world", "does not exist"]
  - match: {hits.total.value: 1}
---
"Prefix query on wildcard field":
  - do:
      search:
        body:
          track_total_hits: true
          query:
            prefix:
              my_wildcard:
                value: "hell*"
  - match: {hits.total.value: 1}
---
"Sequence fail":
  - do:
      search:
        body:
          track_total_hits: true
          query:
            wildcard:
              my_wildcard: {value: "*world*hello*" }
  - match: {hits.total.value: 0}
---
"Aggs work":
  - do:
      search:
        body:
          track_total_hits: true
          query:
            wildcard:
              my_wildcard: {value: "*world*" }
          aggs:
            top_vals:
              terms: {field: "my_wildcard" }
  - match: {hits.total.value: 2}
  - length: { aggregations.top_vals.buckets: 2 }
---
"Sort works":
  - do:
      search:
        body:
          track_total_hits: true
          sort: [ { "my_wildcard": "desc" } ]
  - match: { hits.total.value: 2 }
  - length: { hits.hits: 2 }
  - match: { hits.hits.0._id: "1" }
  - match: { hits.hits.1._id: "2" }
  - do:
      search:
        body:
          track_total_hits: true
          sort: [ { "my_wildcard": "asc" } ]
  - match: { hits.total.value: 2 }
  - length: { hits.hits: 2 }
  - match: { hits.hits.0._id: "2" }
  - match: { hits.hits.1._id: "1" }


@@ -0,0 +1,18 @@
evaluationDependsOn(xpackModule('core'))
apply plugin: 'elasticsearch.esplugin'
esplugin {
name 'wildcard'
description 'A plugin for a keyword field type with efficient wildcard search'
classname 'org.elasticsearch.xpack.wildcard.Wildcard'
extendedPlugins = ['x-pack-core']
}
archivesBaseName = 'x-pack-wildcard'
dependencies {
compileOnly project(path: xpackModule('core'), configuration: 'default')
testCompile project(path: xpackModule('core'), configuration: 'testArtifacts')
}
integTest.enabled = false


@@ -0,0 +1,31 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.wildcard;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.mapper.Mapper;
import org.elasticsearch.plugins.MapperPlugin;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.xpack.wildcard.mapper.WildcardFieldMapper;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.Map;
public class Wildcard extends Plugin implements MapperPlugin {
public Wildcard(Settings settings) {
}
@Override
public Map<String, Mapper.TypeParser> getMappers() {
Map<String, Mapper.TypeParser> mappers = new LinkedHashMap<>();
mappers.put(WildcardFieldMapper.CONTENT_TYPE, new WildcardFieldMapper.TypeParser());
return Collections.unmodifiableMap(mappers);
}
}


@@ -0,0 +1,104 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.wildcard.mapper;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.ConstantScoreWeight;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.search.Weight;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
import java.io.IOException;
import java.util.Objects;
/**
* Query that runs an Automaton across all binary doc values.
* Expensive to run so normally used in conjunction with more selective query clauses.
*/
public class AutomatonQueryOnBinaryDv extends Query {
private final String field;
private final String matchPattern;
private final Automaton automaton;
public AutomatonQueryOnBinaryDv(String field, String matchPattern, Automaton automaton) {
this.field = field;
this.matchPattern = matchPattern;
this.automaton = automaton;
}
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automaton);
return new ConstantScoreWeight(this, boost) {
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
ByteArrayDataInput badi = new ByteArrayDataInput();
final BinaryDocValues values = DocValues.getBinary(context.reader(), field);
TwoPhaseIterator twoPhase = new TwoPhaseIterator(values) {
@Override
public boolean matches() throws IOException {
BytesRef arrayOfValues = values.binaryValue();
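// The doc's values are packed into one binary doc value: a vInt count followed by,
// for each value, a vInt length and then that many bytes (the layout produced by
// CustomBinaryDocValuesField, which WildcardFieldMapper uses to store values).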
badi.reset(arrayOfValues.bytes);
badi.setPosition(arrayOfValues.offset);
int size = badi.readVInt();
for (int i = 0; i < size; i++) {
int valLength = badi.readVInt();
if (bytesMatcher.run(arrayOfValues.bytes, badi.getPosition(), valLength)) {
return true;
}
badi.skipBytes(valLength);
}
return false;
}
@Override
public float matchCost() {
// TODO: how can we compute this?
return 1000f;
}
};
return new ConstantScoreScorer(this, score(), scoreMode, twoPhase);
}
@Override
public boolean isCacheable(LeafReaderContext ctx) {
return true;
}
};
}
@Override
public String toString(String field) {
return field+":"+matchPattern;
}
@Override
public boolean equals(Object obj) {
if (obj == null || getClass() != obj.getClass()) {
return false;
}
AutomatonQueryOnBinaryDv other = (AutomatonQueryOnBinaryDv) obj;
return Objects.equals(field, other.field) && Objects.equals(matchPattern, other.matchPattern);
}
@Override
public int hashCode() {
return Objects.hash(field, matchPattern);
}
}


@@ -0,0 +1,575 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.wildcard.mapper;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.DocValuesFieldExistsQuery;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.MultiTermQuery.RewriteMethod;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.automaton.Automaton;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.common.lucene.BytesRefs;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.common.xcontent.support.XContentMapValues;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AnalyzerScope;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.IndexFieldData.XFieldComparatorSource.Nested;
import org.elasticsearch.index.fielddata.IndexFieldDataCache;
import org.elasticsearch.index.fielddata.fieldcomparator.BytesRefFieldComparatorSource;
import org.elasticsearch.index.fielddata.plain.BytesBinaryDVIndexFieldData;
import org.elasticsearch.index.mapper.BinaryFieldMapper.CustomBinaryDocValuesField;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.Mapper;
import org.elasticsearch.index.mapper.MapperParsingException;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.index.mapper.ParseContext;
import org.elasticsearch.index.mapper.ParseContext.Document;
import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.index.similarity.SimilarityProvider;
import org.elasticsearch.indices.breaker.CircuitBreakerService;
import org.elasticsearch.search.MultiValueMode;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import static org.elasticsearch.index.mapper.TypeParsers.parseField;
/**
* A {@link FieldMapper} for indexing fields with ngrams for efficient wildcard matching
*/
public class WildcardFieldMapper extends FieldMapper {
public static final String CONTENT_TYPE = "wildcard";
public static short MAX_CLAUSES_IN_APPROXIMATION_QUERY = 10;
public static final int NGRAM_SIZE = 3;
static final NamedAnalyzer WILDCARD_ANALYZER = new NamedAnalyzer("_wildcard", AnalyzerScope.GLOBAL, new Analyzer() {
@Override
public TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new NGramTokenizer(NGRAM_SIZE, NGRAM_SIZE);
return new TokenStreamComponents(tokenizer);
}
});
public static class Defaults {
public static final MappedFieldType FIELD_TYPE = new WildcardFieldType();
static {
FIELD_TYPE.setTokenized(false);
FIELD_TYPE.setIndexAnalyzer(WILDCARD_ANALYZER);
FIELD_TYPE.setSearchAnalyzer(Lucene.KEYWORD_ANALYZER);
FIELD_TYPE.setIndexOptions(IndexOptions.DOCS);
FIELD_TYPE.setStoreTermVectorOffsets(false);
FIELD_TYPE.setOmitNorms(true);
FIELD_TYPE.freeze();
}
public static final int IGNORE_ABOVE = Integer.MAX_VALUE;
}
public static class Builder extends FieldMapper.Builder<Builder, WildcardFieldMapper> {
protected int ignoreAbove = Defaults.IGNORE_ABOVE;
public Builder(String name) {
super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE);
builder = this;
}
@Override
public Builder docValues(boolean docValues) {
if (docValues == false) {
throw new MapperParsingException("The field [" + name + "] cannot have doc values = false");
}
return this;
}
@Override
public Builder indexOptions(IndexOptions indexOptions) {
if (indexOptions != IndexOptions.DOCS) {
throw new MapperParsingException("The field [" + name + "] cannot have indexOptions = " + indexOptions);
}
return this;
}
@Override
public Builder store(boolean store) {
if (store) {
throw new MapperParsingException("The field [" + name + "] cannot have store = true");
}
return this;
}
@Override
public Builder similarity(SimilarityProvider similarity) {
throw new MapperParsingException("The field [" + name + "] cannot have custom similarities");
}
@Override
public Builder index(boolean index) {
if (index == false) {
throw new MapperParsingException("The field [" + name + "] cannot have index = false");
}
return this;
}
public Builder ignoreAbove(int ignoreAbove) {
if (ignoreAbove < 0) {
throw new IllegalArgumentException("[ignore_above] must be positive, got " + ignoreAbove);
}
this.ignoreAbove = ignoreAbove;
return this;
}
@Override
protected void setupFieldType(BuilderContext context) {
super.setupFieldType(context);
fieldType().setHasDocValues(true);
fieldType().setTokenized(false);
fieldType().setIndexOptions(IndexOptions.DOCS);
}
@Override
public WildcardFieldType fieldType() {
return (WildcardFieldType) super.fieldType();
}
@Override
public WildcardFieldMapper build(BuilderContext context) {
setupFieldType(context);
return new WildcardFieldMapper(
name, fieldType, defaultFieldType, ignoreAbove,
context.indexSettings(), multiFieldsBuilder.build(this, context), copyTo);
}
}
public static class TypeParser implements Mapper.TypeParser {
@Override
public Mapper.Builder<?, ?> parse(String name, Map<String, Object> node, ParserContext parserContext)
throws MapperParsingException {
WildcardFieldMapper.Builder builder = new WildcardFieldMapper.Builder(name);
parseField(builder, name, node, parserContext);
for (Iterator<Map.Entry<String, Object>> iterator = node.entrySet().iterator(); iterator.hasNext();) {
Map.Entry<String, Object> entry = iterator.next();
String propName = entry.getKey();
Object propNode = entry.getValue();
if (propName.equals("ignore_above")) {
builder.ignoreAbove(XContentMapValues.nodeIntegerValue(propNode, -1));
iterator.remove();
}
}
return builder;
}
}
public static final char TOKEN_START_OR_END_CHAR = 0;
public static final class WildcardFieldType extends MappedFieldType {
public WildcardFieldType() {
setIndexAnalyzer(Lucene.KEYWORD_ANALYZER);
setSearchAnalyzer(Lucene.KEYWORD_ANALYZER);
}
protected WildcardFieldType(WildcardFieldType ref) {
super(ref);
}
public WildcardFieldType clone() {
WildcardFieldType result = new WildcardFieldType(this);
return result;
}
// Holds parsed information about the wildcard pattern
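// Illustrative trace: "a*bc?d" parses to fragments ["a", "bc", "d"] with
// precedingGapSizes [0, Integer.MAX_VALUE, 1] and openStart == openEnd == false.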
static class PatternStructure {
boolean openStart, openEnd, hasSymbols;
int lastGap = 0;
int wildcardCharCount, wildcardStringCount;
String[] fragments;
Integer[] precedingGapSizes;
final String pattern;
@SuppressWarnings("fallthrough") // Intentionally uses fallthrough mirroring implementation in Lucene's WildcardQuery
PatternStructure(String wildcardText) {
this.pattern = wildcardText;
ArrayList<String> fragmentList = new ArrayList<>();
ArrayList<Integer> precedingGapSizeList = new ArrayList<>();
StringBuilder sb = new StringBuilder();
for (int i = 0; i < wildcardText.length();) {
final int c = wildcardText.codePointAt(i);
int length = Character.charCount(c);
switch (c) {
case WildcardQuery.WILDCARD_STRING:
if (i == 0) {
openStart = true;
}
openEnd = true;
hasSymbols = true;
wildcardStringCount++;
if (sb.length() > 0) {
precedingGapSizeList.add(lastGap);
fragmentList.add(sb.toString());
sb = new StringBuilder();
}
lastGap = Integer.MAX_VALUE;
break;
case WildcardQuery.WILDCARD_CHAR:
if (i == 0) {
openStart = true;
}
hasSymbols = true;
wildcardCharCount++;
openEnd = true;
if (sb.length() > 0) {
precedingGapSizeList.add(lastGap);
fragmentList.add(sb.toString());
sb = new StringBuilder();
lastGap = 0;
}
if (lastGap != Integer.MAX_VALUE) {
lastGap++;
}
break;
case WildcardQuery.WILDCARD_ESCAPE:
// add the next codepoint instead, if it exists
if (i + length < wildcardText.length()) {
final int nextChar = wildcardText.codePointAt(i + length);
length += Character.charCount(nextChar);
sb.append(Character.toChars(nextChar));
openEnd = false;
break;
} // else fallthru, lenient parsing with a trailing \
default:
openEnd = false;
sb.append(Character.toChars(c));
}
i += length;
}
if (sb.length() > 0) {
precedingGapSizeList.add(lastGap);
fragmentList.add(sb.toString());
lastGap = 0;
}
fragments = fragmentList.toArray(new String[0]);
precedingGapSizes = precedingGapSizeList.toArray(new Integer[0]);
}
public boolean needsVerification() {
// Return true if term queries are not enough evidence
if (fragments.length == 1 && wildcardCharCount == 0) {
// The one case where we don't need verification is when
// we have a single fragment and no ? characters
return false;
}
return true;
}
// Returns the number of positions in the gap preceding the given fragment (Integer.MAX_VALUE means an unlimited gap)
public int getPrecedingGapSize(int fragmentNum) {
return precedingGapSizes[fragmentNum];
}
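// True only for patterns made entirely of '*' wildcards, e.g. "*" or "**".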
public boolean isMatchAll() {
return fragments.length == 0 && wildcardStringCount > 0 && wildcardCharCount == 0;
}
@Override
public int hashCode() {
return pattern.hashCode();
}
@Override
public boolean equals(Object obj) {
PatternStructure other = (PatternStructure) obj;
return pattern.equals(other.pattern);
}
}
@Override
public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QueryShardContext context) {
PatternStructure patternStructure = new PatternStructure(wildcardPattern);
ArrayList<String> tokens = new ArrayList<>();
for (int i = 0; i < patternStructure.fragments.length; i++) {
String fragment = patternStructure.fragments[i];
int fLength = fragment.length();
if (fLength == 0) {
continue;
}
// Add any start/end of string character
if (i == 0 && patternStructure.openStart == false) {
// Start-of-string anchored (is not a leading wildcard)
fragment = TOKEN_START_OR_END_CHAR + fragment;
}
if (patternStructure.openEnd == false && i == patternStructure.fragments.length - 1) {
// End-of-string anchored (is not a trailing wildcard)
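// Two end markers are appended because createFields() frames each stored value with
// two trailing TOKEN_START_OR_END_CHARs, so even a one-char anchored fragment
// forms a complete ngram (an exact term query) rather than a looser prefix match.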
fragment = fragment + TOKEN_START_OR_END_CHAR + TOKEN_START_OR_END_CHAR;
}
if (fragment.codePointCount(0, fragment.length()) <= NGRAM_SIZE) {
tokens.add(fragment);
} else {
// Break fragment into multiple Ngrams
TokenStream tokenizer = WILDCARD_ANALYZER.tokenStream(name(), fragment);
CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
String lastUnusedToken = null;
try {
tokenizer.reset();
boolean takeThis = true;
// minimise number of terms searched - eg for "12345" and 3grams we only need terms
// `123` and `345` - no need to search for 234. We take every other ngram.
while (tokenizer.incrementToken()) {
String tokenValue = termAtt.toString();
if (takeThis) {
tokens.add(tokenValue);
} else {
lastUnusedToken = tokenValue;
}
// alternate
takeThis = !takeThis;
}
if (lastUnusedToken != null) {
// given `cake` and 3 grams the loop above would output only `cak` and we need to add trailing
// `ake` to complete the logic.
tokens.add(lastUnusedToken);
}
tokenizer.end();
tokenizer.close();
} catch (IOException ioe) {
throw new ElasticsearchParseException("Error parsing wildcard query pattern fragment [" + fragment + "]");
}
}
}
if (patternStructure.isMatchAll()) {
return new MatchAllDocsQuery();
}
BooleanQuery approximation = createApproximationQuery(tokens);
if (approximation.clauses().size() > 1 || patternStructure.needsVerification()) {
BooleanQuery.Builder verifyingBuilder = new BooleanQuery.Builder();
verifyingBuilder.add(new BooleanClause(approximation, Occur.MUST));
Automaton automaton = WildcardQuery.toAutomaton(new Term(name(), wildcardPattern));
verifyingBuilder.add(new BooleanClause(new AutomatonQueryOnBinaryDv(name(), wildcardPattern, automaton), Occur.MUST));
return verifyingBuilder.build();
}
return approximation;
}
private BooleanQuery createApproximationQuery(ArrayList<String> tokens) {
BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
if (tokens.size() <= MAX_CLAUSES_IN_APPROXIMATION_QUERY) {
for (String token : tokens) {
addClause(token, bqBuilder);
}
return bqBuilder.build();
}
// Thin out the number of clauses using a selection spread evenly across the range
float step = (float) (tokens.size() - 1) / (float) (MAX_CLAUSES_IN_APPROXIMATION_QUERY - 1); // set step size
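// e.g. 19 tokens with a cap of 10 gives step = 2.0, selecting indices 0, 2, 4, ... 18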
for (int i = 0; i < MAX_CLAUSES_IN_APPROXIMATION_QUERY; i++) {
addClause(tokens.get(Math.round(step * i)), bqBuilder); // add each element of a position which is a multiple of step
}
// TODO we can be smarter about pruning here. e.g.
// * Avoid wildcard queries if there are sufficient numbers of other terms that are full 3grams that are cheaper term queries
// * We can select terms on their scarcity rather than even spreads across the search string.
return bqBuilder.build();
}
private void addClause(String token, BooleanQuery.Builder bqBuilder) {
assert token.codePointCount(0, token.length()) <= NGRAM_SIZE;
if (token.codePointCount(0, token.length()) == NGRAM_SIZE) {
TermQuery tq = new TermQuery(new Term(name(), token));
bqBuilder.add(new BooleanClause(tq, Occur.MUST));
} else {
WildcardQuery wq = new WildcardQuery(new Term(name(), token + "*"));
wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE);
bqBuilder.add(new BooleanClause(wq, Occur.MUST));
}
}
@Override
public String typeName() {
return CONTENT_TYPE;
}
@Override
public Query existsQuery(QueryShardContext context) {
return new DocValuesFieldExistsQuery(name());
}
@Override
public Query termQuery(Object value, QueryShardContext context) {
return wildcardQuery(BytesRefs.toString(value), MultiTermQuery.CONSTANT_SCORE_REWRITE, context);
}
@Override
public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
return wildcardQuery(value + "*", method, context);
}
@Override
public Query termsQuery(List<?> values, QueryShardContext context) {
BooleanQuery.Builder bq = new BooleanQuery.Builder();
for (Object value : values) {
bq.add(termQuery(value, context), Occur.SHOULD);
}
return new ConstantScoreQuery(bq.build());
}
@Override
public IndexFieldData.Builder fielddataBuilder(String fullyQualifiedIndexName) {
failIfNoDocValues();
return new IndexFieldData.Builder() {
@Override
public IndexFieldData<?> build(IndexSettings indexSettings, MappedFieldType fieldType, IndexFieldDataCache cache,
CircuitBreakerService breakerService, MapperService mapperService) {
return new WildcardBytesBinaryDVIndexFieldData(indexSettings.getIndex(), fieldType.name());
}};
}
}
static class WildcardBytesBinaryDVIndexFieldData extends BytesBinaryDVIndexFieldData {
WildcardBytesBinaryDVIndexFieldData(Index index, String fieldName) {
super(index, fieldName);
}
@Override
public SortField sortField(Object missingValue, MultiValueMode sortMode, Nested nested, boolean reverse) {
XFieldComparatorSource source = new BytesRefFieldComparatorSource(this, missingValue,
sortMode, nested);
return new SortField(getFieldName(), source, reverse);
}
}
private int ignoreAbove;
private WildcardFieldMapper(String simpleName, MappedFieldType fieldType, MappedFieldType defaultFieldType,
int ignoreAbove, Settings indexSettings, MultiFields multiFields, CopyTo copyTo) {
super(simpleName, fieldType, defaultFieldType, indexSettings, multiFields, copyTo);
this.ignoreAbove = ignoreAbove;
assert fieldType.indexOptions() == IndexOptions.DOCS;
ngramFieldType = fieldType.clone();
ngramFieldType.setTokenized(true);
ngramFieldType.freeze();
}
/** Values that have more chars than the return value of this method will
* be skipped at parsing time. */
// pkg-private for testing
int ignoreAbove() {
return ignoreAbove;
}
@Override
protected WildcardFieldMapper clone() {
return (WildcardFieldMapper) super.clone();
}
@Override
public WildcardFieldType fieldType() {
return (WildcardFieldType) super.fieldType();
}
@Override
protected void doXContentBody(XContentBuilder builder, boolean includeDefaults, Params params) throws IOException {
super.doXContentBody(builder, includeDefaults, params);
if (includeDefaults || ignoreAbove != Defaults.IGNORE_ABOVE) {
builder.field("ignore_above", ignoreAbove);
}
}
@Override
protected void parseCreateField(ParseContext context, List<IndexableField> fields) throws IOException {
final String value;
if (context.externalValueSet()) {
value = context.externalValue().toString();
} else {
XContentParser parser = context.parser();
if (parser.currentToken() == XContentParser.Token.VALUE_NULL) {
value = fieldType().nullValueAsString();
} else {
value = parser.textOrNull();
}
}
ParseContext.Document parseDoc = context.doc();
createFields(value, parseDoc, fields);
}
// For internal use by Lucene only - used to define ngram index
final MappedFieldType ngramFieldType;
void createFields(String value, Document parseDoc, List<IndexableField> fields) {
if (value == null || value.length() > ignoreAbove) {
return;
}
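// Frame the value with anchor chars: one leading and two trailing TOKEN_START_OR_END_CHARs,
// mirroring the anchoring applied to query fragments in wildcardQuery().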
String ngramValue = TOKEN_START_OR_END_CHAR + value + TOKEN_START_OR_END_CHAR + TOKEN_START_OR_END_CHAR;
Field ngramField = new Field(fieldType().name(), ngramValue, ngramFieldType);
fields.add(ngramField);
CustomBinaryDocValuesField dvField = (CustomBinaryDocValuesField) parseDoc.getByKey(fieldType().name());
if (dvField == null) {
dvField = new CustomBinaryDocValuesField(fieldType().name(), value.getBytes(StandardCharsets.UTF_8));
parseDoc.addWithKey(fieldType().name(), dvField);
} else {
dvField.add(value.getBytes(StandardCharsets.UTF_8));
}
}
@Override
protected String contentType() {
return CONTENT_TYPE;
}
@Override
protected void doMerge(Mapper mergeWith) {
super.doMerge(mergeWith);
this.ignoreAbove = ((WildcardFieldMapper) mergeWith).ignoreAbove;
}
}


@@ -0,0 +1,331 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.wildcard.mapper;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.cache.bitset.BitsetFilterCache;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.IndexFieldDataCache;
import org.elasticsearch.index.mapper.ContentPath;
import org.elasticsearch.index.mapper.KeywordFieldMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.Mapper;
import org.elasticsearch.index.mapper.MapperParsingException;
import org.elasticsearch.index.mapper.ParseContext;
import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.search.sort.FieldSortBuilder;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.IndexSettingsModule;
import org.elasticsearch.xpack.wildcard.mapper.WildcardFieldMapper.Builder;
import org.junit.Before;
import org.mockito.Mockito;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.function.BiFunction;
import static org.hamcrest.Matchers.equalTo;
public class WildcardFieldMapperTests extends ESTestCase {
private static final String KEYWORD_FIELD_NAME = "keyword_field";
private static final String WILDCARD_FIELD_NAME = "wildcard_field";
static final int MAX_FIELD_LENGTH = 100;
static WildcardFieldMapper wildcardFieldType;
static KeywordFieldMapper keywordFieldType;
@Override
@Before
public void setUp() throws Exception {
Builder builder = new WildcardFieldMapper.Builder(WILDCARD_FIELD_NAME);
builder.ignoreAbove(MAX_FIELD_LENGTH);
wildcardFieldType = builder.build(new Mapper.BuilderContext(createIndexSettings().getSettings(), new ContentPath(0)));
org.elasticsearch.index.mapper.KeywordFieldMapper.Builder kwBuilder = new KeywordFieldMapper.Builder(KEYWORD_FIELD_NAME);
keywordFieldType = kwBuilder.build(new Mapper.BuilderContext(createIndexSettings().getSettings(), new ContentPath(0)));
super.setUp();
}
public void testIllegalDocValuesArgument() {
Builder ft = new WildcardFieldMapper.Builder("test");
MapperParsingException e = expectThrows(MapperParsingException.class,
() -> ft.docValues(false));
assertEquals("The field [test] cannot have doc values = false", e.getMessage());
}
public void testIllegalIndexedArgument() {
Builder ft = new WildcardFieldMapper.Builder("test");
MapperParsingException e = expectThrows(MapperParsingException.class,
() -> ft.index(false));
assertEquals("The field [test] cannot have index = false", e.getMessage());
}
public void testTooBigKeywordField() throws IOException {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER);
iwc.setMergePolicy(newTieredMergePolicy(random()));
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
// Create a string that is too large and will not be indexed
String docContent = randomABString(MAX_FIELD_LENGTH + 1);
Document doc = new Document();
ParseContext.Document parseDoc = new ParseContext.Document();
addFields(parseDoc, doc, docContent);
indexDoc(parseDoc, doc, iw);
iw.forceMerge(1);
DirectoryReader reader = iw.getReader();
IndexSearcher searcher = newSearcher(reader);
iw.close();
Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery("*a*", null, null);
TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, 10, Sort.INDEXORDER);
assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(0L));
reader.close();
dir.close();
}
// Test that long query strings don't cause exceptions
public void testTooBigQueryField() throws IOException {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER);
iwc.setMergePolicy(newTieredMergePolicy(random()));
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
// Create a normal-sized string that will be indexed
String docContent = randomABString(10);
Document doc = new Document();
ParseContext.Document parseDoc = new ParseContext.Document();
addFields(parseDoc, doc, docContent);
indexDoc(parseDoc, doc, iw);
iw.forceMerge(1);
DirectoryReader reader = iw.getReader();
IndexSearcher searcher = newSearcher(reader);
iw.close();
String queryString = randomABString((BooleanQuery.getMaxClauseCount() * 2) + 1);
Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(queryString, null, null);
TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, 10, Sort.INDEXORDER);
assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(0L));
reader.close();
dir.close();
}
public void testSearchResultsVersusKeywordField() throws IOException {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER);
iwc.setMergePolicy(newTieredMergePolicy(random()));
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
int numDocs = 100;
HashSet<String> values = new HashSet<>();
for (int i = 0; i < numDocs; i++) {
Document doc = new Document();
ParseContext.Document parseDoc = new ParseContext.Document();
String docContent = randomABString(1 + randomInt(MAX_FIELD_LENGTH - 1));
if (values.contains(docContent) == false) {
addFields(parseDoc, doc, docContent);
values.add(docContent);
}
// Occasionally add a multi-value field
if (randomBoolean()) {
docContent = randomABString(1 + randomInt(MAX_FIELD_LENGTH - 1));
if (values.contains(docContent) == false) {
addFields(parseDoc, doc, docContent);
values.add(docContent);
}
}
indexDoc(parseDoc, doc, iw);
}
iw.forceMerge(1);
DirectoryReader reader = iw.getReader();
IndexSearcher searcher = newSearcher(reader);
iw.close();
int numSearches = 100;
for (int i = 0; i < numSearches; i++) {
String randomWildcardPattern = getRandomWildcardPattern();
Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(randomWildcardPattern, null, null);
TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, values.size() + 1, Sort.INDEXORDER);
Query keywordFieldQuery = new WildcardQuery(new Term(KEYWORD_FIELD_NAME, randomWildcardPattern));
TopDocs kwTopDocs = searcher.search(keywordFieldQuery, values.size() + 1, Sort.INDEXORDER);
assertThat(kwTopDocs.totalHits.value, equalTo(wildcardFieldTopDocs.totalHits.value));
HashSet<Integer> expectedDocs = new HashSet<>();
for (ScoreDoc topDoc : kwTopDocs.scoreDocs) {
expectedDocs.add(topDoc.doc);
}
for (ScoreDoc wcTopDoc : wildcardFieldTopDocs.scoreDocs) {
assertTrue(expectedDocs.remove(wcTopDoc.doc));
}
assertThat(expectedDocs.size(), equalTo(0));
}
// Test that keyword and wildcard sort operations are also equivalent
QueryShardContext shardContextMock = createMockShardContext();
FieldSortBuilder wildcardSortBuilder = new FieldSortBuilder(WILDCARD_FIELD_NAME);
SortField wildcardSortField = wildcardSortBuilder.build(shardContextMock).field;
ScoreDoc[] wildcardHits = searcher.search(new MatchAllDocsQuery(), numDocs, new Sort(wildcardSortField)).scoreDocs;
FieldSortBuilder keywordSortBuilder = new FieldSortBuilder(KEYWORD_FIELD_NAME);
SortField keywordSortField = keywordSortBuilder.build(shardContextMock).field;
ScoreDoc[] keywordHits = searcher.search(new MatchAllDocsQuery(), numDocs, new Sort(keywordSortField)).scoreDocs;
assertThat(wildcardHits.length, equalTo(keywordHits.length));
for (int i = 0; i < wildcardHits.length; i++) {
assertThat(wildcardHits[i].doc, equalTo(keywordHits[i].doc));
}
reader.close();
dir.close();
}
protected MappedFieldType provideMappedFieldType(String name) {
if (name.equals(WILDCARD_FIELD_NAME)) {
return wildcardFieldType.fieldType();
} else {
return keywordFieldType.fieldType();
}
}
protected final QueryShardContext createMockShardContext() {
Index index = new Index(randomAlphaOfLengthBetween(1, 10), "_na_");
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings(index,
Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build());
BitsetFilterCache bitsetFilterCache = new BitsetFilterCache(idxSettings, Mockito.mock(BitsetFilterCache.Listener.class));
BiFunction<MappedFieldType, String, IndexFieldData<?>> indexFieldDataLookup = (fieldType, fieldIndexName) -> {
IndexFieldData.Builder builder = fieldType.fielddataBuilder(fieldIndexName);
return builder.build(idxSettings, fieldType, new IndexFieldDataCache.None(), null, null);
};
return new QueryShardContext(0, idxSettings, BigArrays.NON_RECYCLING_INSTANCE, bitsetFilterCache, indexFieldDataLookup,
null, null, null, xContentRegistry(), null, null, null,
() -> randomNonNegativeLong(), null, null, () -> true) {
@Override
public MappedFieldType fieldMapper(String name) {
return provideMappedFieldType(name);
}
};
}
private void addFields(ParseContext.Document parseDoc, Document doc, String docContent) throws IOException {
ArrayList<IndexableField> fields = new ArrayList<>();
wildcardFieldType.createFields(docContent, parseDoc, fields);
for (IndexableField indexableField : fields) {
doc.add(indexableField);
}
// Add keyword fields too
doc.add(new SortedSetDocValuesField(KEYWORD_FIELD_NAME, new BytesRef(docContent)));
doc.add(new StringField(KEYWORD_FIELD_NAME, docContent, Field.Store.YES));
}
private void indexDoc(ParseContext.Document parseDoc, Document doc, RandomIndexWriter iw) throws IOException {
IndexableField field = parseDoc.getByKey(wildcardFieldType.name());
if (field != null) {
doc.add(field);
}
iw.addDocument(doc);
}
protected IndexSettings createIndexSettings() {
return new IndexSettings(
IndexMetaData.builder("_index").settings(Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT))
.numberOfShards(1).numberOfReplicas(0).creationDate(System.currentTimeMillis()).build(),
Settings.EMPTY);
}
static String randomABString(int minLength) {
StringBuilder sb = new StringBuilder();
while (sb.length() < minLength) {
if (randomBoolean()) {
sb.append("a");
} else {
sb.append("b");
}
}
return sb.toString();
}
private void randomSyntaxChar(StringBuilder sb) {
switch (randomInt(3)) {
case 0:
sb.append(WildcardQuery.WILDCARD_CHAR);
break;
case 1:
sb.append(WildcardQuery.WILDCARD_STRING);
break;
case 2:
sb.append(WildcardQuery.WILDCARD_ESCAPE);
sb.append(WildcardQuery.WILDCARD_STRING);
break;
case 3:
sb.append(WildcardQuery.WILDCARD_ESCAPE);
sb.append(WildcardQuery.WILDCARD_CHAR);
break;
}
}
private String getRandomWildcardPattern() {
StringBuilder sb = new StringBuilder();
int numFragments = 1 + randomInt(4);
if (randomInt(10) == 1) {
randomSyntaxChar(sb);
}
for (int i = 0; i < numFragments; i++) {
if (i > 0) {
randomSyntaxChar(sb);
}
sb.append(randomABString(1 + randomInt(6)));
}
if (randomInt(10) == 1) {
randomSyntaxChar(sb);
}
return sb.toString();
}
}


@@ -0,0 +1,19 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.wildcard.mapper;
import org.elasticsearch.index.mapper.FieldTypeTestCase;
import org.elasticsearch.index.mapper.MappedFieldType;
public class WildcardFieldTypeTests extends FieldTypeTestCase {
@Override
protected MappedFieldType createDefaultFieldType() {
return new WildcardFieldMapper.WildcardFieldType();
}
}