Backport of new wildcard field type (#53590)
* New wildcard field optimised for wildcard queries (#49993) Indexes values using size 3 ngrams and also stores the full original as a binary doc value. Wildcard queries operate by using a cheap approximation query on the ngram field followed up by a more expensive verification query using an automaton on the binary doc values. Also supports aggregations and sorting.
This commit is contained in:
parent a906f8a0e4
commit 2c74f3e22c
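To make the indexing/search strategy concrete, here is a minimal, hypothetical sketch of the approximate-then-verify idea described above. It is not code from this commit; the class and method names are illustrative. A concrete fragment of a pattern is reduced to a small set of indexed 3-grams whose term queries gate the expensive automaton verification:

import java.util.ArrayList;
import java.util.List;

// Hypothetical sketch of the two-phase wildcard strategy: cheap ngram term
// queries first (the approximation), then an automaton check of the full
// stored value for the surviving documents (the verification).
public class NgramApproximationSketch {
    static final int NGRAM_SIZE = 3;

    // Reduce a concrete fragment (no * or ? in it) to alternating 3-grams,
    // e.g. "error" -> [err, ror]; adjacent grams overlap by one character so
    // every position is still covered. (Simplified: the real mapper also keeps
    // the final trailing gram when the alternation would otherwise miss it.)
    static List<String> approximationTerms(String fragment) {
        List<String> grams = new ArrayList<>();
        for (int i = 0; i + NGRAM_SIZE <= fragment.length(); i += NGRAM_SIZE - 1) {
            grams.add(fragment.substring(i, i + NGRAM_SIZE));
        }
        return grams;
    }

    public static void main(String[] args) {
        // A "*error*" query would AND term queries for these grams, then verify
        // each candidate by running the wildcard automaton over the binary doc
        // value that holds the original string.
        System.out.println(approximationTerms("error")); // prints [err, ror]
    }
}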
@@ -7,7 +7,7 @@ document:
 [float]
 === Core datatypes

-string:: <<text,`text`>> and <<keyword,`keyword`>>
+string:: <<text,`text`>>, <<keyword,`keyword`>> and <<wildcard,`wildcard`>>
 <<number>>:: `long`, `integer`, `short`, `byte`, `double`, `float`, `half_float`, `scaled_float`
 <<date>>:: `date`
 <<date_nanos>>:: `date_nanos`
@@ -135,3 +135,5 @@ include::types/token-count.asciidoc[]
 include::types/shape.asciidoc[]

 include::types/constant-keyword.asciidoc[]
+
+include::types/wildcard.asciidoc[]
docs/reference/mapping/types/wildcard.asciidoc (new file, 55 lines)
@@ -0,0 +1,55 @@
[role="xpack"]
[testenv="basic"]
[[wildcard]]
=== Wildcard datatype
++++
<titleabbrev>Wildcard</titleabbrev>
++++

A `wildcard` field stores values optimised for wildcard grep-like queries.
Wildcard queries are possible on other field types but suffer from constraints:

* `text` fields limit matching of wildcard expressions to individual tokens rather than the whole original value held in a field
* `keyword` fields are untokenized, but slow at performing wildcard queries (especially patterns with leading wildcards).

Internally the `wildcard` field indexes the whole field value using ngrams and stores the full string.
The index is used as a rough filter to cut down the number of values that are then checked by retrieving and checking the full values.
For example, a query for `*error*` is first approximated by requiring the indexed 3-grams `err` and `ror`, and only the stored values of matching documents are then checked against the full pattern.
This field is especially well suited to running grep-like queries on log lines. Storage costs are typically lower than those of `keyword`
fields, but search speeds for exact matches on full terms are slower.

You index and search a wildcard field as follows:

[source,console]
--------------------------------------------------
PUT my_index
{
  "mappings": {
    "properties": {
      "my_wildcard": {
        "type": "wildcard"
      }
    }
  }
}

PUT my_index/_doc/1
{
  "my_wildcard" : "This string can be quite lengthy"
}

POST my_index/_search
{
  "query": {
    "wildcard" : {
      "my_wildcard" : {
        "value": "*quite*lengthy"
      }
    }
  }
}
--------------------------------------------------

==== Limitations

* `wildcard` fields are untokenized like `keyword` fields, so they do not support queries that rely on word positions, such as phrase queries.
@@ -21,8 +21,6 @@ package org.elasticsearch.index.fielddata.plain;

 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.search.SortField;
-import org.apache.lucene.search.SortedSetSortField;
-import org.apache.lucene.search.SortedSetSelector;
 import org.elasticsearch.common.Nullable;
 import org.elasticsearch.common.util.BigArrays;
 import org.elasticsearch.index.Index;

@@ -54,20 +52,7 @@ public class BinaryDVIndexFieldData extends DocValuesIndexFieldData implements I
     public SortField sortField(@Nullable Object missingValue, MultiValueMode sortMode, XFieldComparatorSource.Nested nested,
                                boolean reverse) {
         XFieldComparatorSource source = new BytesRefFieldComparatorSource(this, missingValue, sortMode, nested);
-        /**
-         * Check if we can use a simple {@link SortedSetSortField} compatible with index sorting and
-         * returns a custom sort field otherwise.
-         */
-        if (nested != null ||
-                (sortMode != MultiValueMode.MAX && sortMode != MultiValueMode.MIN) ||
-                (source.sortMissingFirst(missingValue) == false && source.sortMissingLast(missingValue) == false)) {
-            return new SortField(getFieldName(), source, reverse);
-        }
-        SortField sortField = new SortedSetSortField(fieldName, reverse,
-            sortMode == MultiValueMode.MAX ? SortedSetSelector.Type.MAX : SortedSetSelector.Type.MIN);
-        sortField.setMissingValue(source.sortMissingLast(missingValue) ^ reverse ?
-            SortedSetSortField.STRING_LAST : SortedSetSortField.STRING_FIRST);
-        return sortField;
+        return new SortField(getFieldName(), source, reverse);
     }

     @Override
@@ -613,6 +613,16 @@ public class XPackLicenseState {
     public boolean isVectorsAllowed() {
         return allowForAllLicenses();
     }

+
+    /**
+     * Determine if Wildcard support should be enabled.
+     * <p>
+     * Wildcard is available for all license types except {@link OperationMode#MISSING}
+     */
+    public synchronized boolean isWildcardAllowed() {
+        return status.active;
+    }
+
     public boolean isOdbcAllowed() {
         return isAllowedByLicense(OperationMode.PLATINUM);
@@ -0,0 +1,218 @@
setup:
  - skip:
      features: headers
      version: " - 7.6.99"
      reason: "wildcard fields were added from 7.7"

  - do:
      indices.create:
        index: test-index
        body:
          settings:
            number_of_replicas: 0
          mappings:
            properties:
              my_wildcard:
                type: wildcard
  - do:
      index:
        index: test-index
        id: 1
        body:
          my_wildcard: hello world
  - do:
      index:
        index: test-index
        id: 2
        body:
          my_wildcard: goodbye world

  - do:
      indices.refresh: {}

---
# "Short" and "long" test variants exercise patterns below and above the
# 3-gram size that the wildcard field indexes.
"Short prefix query":
  - do:
      search:
        body:
          track_total_hits: true
          query:
            wildcard:
              my_wildcard: {value: "hel*" }

  - match: {hits.total.value: 1}

---
"Long prefix query":
  - do:
      search:
        body:
          track_total_hits: true
          query:
            wildcard:
              my_wildcard: {value: "hello wor*" }

  - match: {hits.total.value: 1}

---
"Short unrooted query":
  - do:
      search:
        body:
          track_total_hits: true
          query:
            wildcard:
              my_wildcard: {value: "*ello*" }

  - match: {hits.total.value: 1}

---
"Long unrooted query":
  - do:
      search:
        body:
          track_total_hits: true
          query:
            wildcard:
              my_wildcard: {value: "*ello worl*" }

  - match: {hits.total.value: 1}

---
"Short suffix query":
  - do:
      search:
        body:
          track_total_hits: true
          query:
            wildcard:
              my_wildcard: {value: "*ld" }

  - match: {hits.total.value: 2}

---
"Long suffix query":
  - do:
      search:
        body:
          track_total_hits: true
          query:
            wildcard:
              my_wildcard: {value: "*ello world" }

  - match: {hits.total.value: 1}

---
"No wildcard wildcard query":
  - do:
      search:
        body:
          track_total_hits: true
          query:
            wildcard:
              my_wildcard: {value: "hello world" }

  - match: {hits.total.value: 1}

---
"Term query on wildcard field":
  - do:
      search:
        body:
          track_total_hits: true
          query:
            term:
              my_wildcard: "hello world"

  - match: {hits.total.value: 1}

---
"Terms query on wildcard field":
  - do:
      search:
        body:
          track_total_hits: true
          query:
            terms:
              my_wildcard: ["hello world", "does not exist"]

  - match: {hits.total.value: 1}

---
"Prefix query on wildcard field":
  - do:
      search:
        body:
          track_total_hits: true
          query:
            prefix:
              my_wildcard:
                value: "hell*"

  - match: {hits.total.value: 1}

---
"Sequence fail":
  - do:
      search:
        body:
          track_total_hits: true
          query:
            wildcard:
              my_wildcard: {value: "*world*hello*" }

  - match: {hits.total.value: 0}

---
"Aggs work":
  - do:
      search:
        body:
          track_total_hits: true
          query:
            wildcard:
              my_wildcard: {value: "*world*" }
          aggs:
            top_vals:
              terms: {field: "my_wildcard" }

  - match: {hits.total.value: 2}
  - length: { aggregations.top_vals.buckets: 2 }

---
"Sort works":
  - do:
      search:
        body:
          track_total_hits: true
          sort: [ { "my_wildcard": "desc" } ]

  - match: { hits.total.value: 2 }
  - length: { hits.hits: 2 }
  - match: { hits.hits.0._id: "1" }
  - match: { hits.hits.1._id: "2" }

  - do:
      search:
        body:
          track_total_hits: true
          sort: [ { "my_wildcard": "asc" } ]

  - match: { hits.total.value: 2 }
  - length: { hits.hits: 2 }
  - match: { hits.hits.0._id: "2" }
  - match: { hits.hits.1._id: "1" }
x-pack/plugin/wildcard/build.gradle (new file, 18 lines)
@@ -0,0 +1,18 @@
evaluationDependsOn(xpackModule('core'))

apply plugin: 'elasticsearch.esplugin'

esplugin {
    name 'wildcard'
    description 'A plugin for a keyword field type with efficient wildcard search'
    classname 'org.elasticsearch.xpack.wildcard.Wildcard'
    extendedPlugins = ['x-pack-core']
}
archivesBaseName = 'x-pack-wildcard'

dependencies {
    compileOnly project(path: xpackModule('core'), configuration: 'default')
    testCompile project(path: xpackModule('core'), configuration: 'testArtifacts')
}

integTest.enabled = false
@@ -0,0 +1,31 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */

package org.elasticsearch.xpack.wildcard;

import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.mapper.Mapper;
import org.elasticsearch.plugins.MapperPlugin;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.xpack.wildcard.mapper.WildcardFieldMapper;

import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.Map;

public class Wildcard extends Plugin implements MapperPlugin {

    public Wildcard(Settings settings) {
    }

    @Override
    public Map<String, Mapper.TypeParser> getMappers() {
        Map<String, Mapper.TypeParser> mappers = new LinkedHashMap<>();
        mappers.put(WildcardFieldMapper.CONTENT_TYPE, new WildcardFieldMapper.TypeParser());
        return Collections.unmodifiableMap(mappers);
    }
}
@@ -0,0 +1,104 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */

package org.elasticsearch.xpack.wildcard.mapper;

import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.ConstantScoreWeight;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.search.Weight;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.ByteRunAutomaton;

import java.io.IOException;
import java.util.Objects;

/**
 * Query that runs an Automaton across all binary doc values.
 * Expensive to run so normally used in conjunction with more selective query clauses.
 */
public class AutomatonQueryOnBinaryDv extends Query {

    private final String field;
    private final String matchPattern;
    private final Automaton automaton;

    public AutomatonQueryOnBinaryDv(String field, String matchPattern, Automaton automaton) {
        this.field = field;
        this.matchPattern = matchPattern;
        this.automaton = automaton;
    }

    @Override
    public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {

        ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automaton);

        return new ConstantScoreWeight(this, boost) {

            @Override
            public Scorer scorer(LeafReaderContext context) throws IOException {
                ByteArrayDataInput badi = new ByteArrayDataInput();
                final BinaryDocValues values = DocValues.getBinary(context.reader(), field);
                TwoPhaseIterator twoPhase = new TwoPhaseIterator(values) {
                    @Override
                    public boolean matches() throws IOException {
                        BytesRef arrayOfValues = values.binaryValue();
                        badi.reset(arrayOfValues.bytes);
                        badi.setPosition(arrayOfValues.offset);

                        int size = badi.readVInt();
                        for (int i = 0; i < size; i++) {
                            int valLength = badi.readVInt();
                            if (bytesMatcher.run(arrayOfValues.bytes, badi.getPosition(), valLength)) {
                                return true;
                            }
                            badi.skipBytes(valLength);
                        }
                        return false;
                    }

                    @Override
                    public float matchCost() {
                        // TODO: how can we compute this?
                        return 1000f;
                    }
                };
                return new ConstantScoreScorer(this, score(), scoreMode, twoPhase);
            }

            @Override
            public boolean isCacheable(LeafReaderContext ctx) {
                return true;
            }
        };
    }

    @Override
    public String toString(String field) {
        return field + ":" + matchPattern;
    }

    @Override
    public boolean equals(Object obj) {
        AutomatonQueryOnBinaryDv other = (AutomatonQueryOnBinaryDv) obj;
        return Objects.equals(field, other.field) && Objects.equals(matchPattern, other.matchPattern);
    }

    @Override
    public int hashCode() {
        return Objects.hash(field, matchPattern);
    }

}
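The matches() loop above assumes a specific layout for the per-document binary doc value: a vInt count of values, then each value as a vInt length prefix followed by its bytes. That layout is produced by BinaryFieldMapper.CustomBinaryDocValuesField, which WildcardFieldMapper.createFields (below) populates. A minimal sketch of the encoding side, for reference only (buffer sizing is simplified here):

import java.io.IOException;
import java.util.List;

import org.apache.lucene.store.ByteArrayDataOutput;

// Sketch of the byte layout AutomatonQueryOnBinaryDv.matches() decodes:
// [vInt count][vInt len][bytes]...[vInt len][bytes]
final class DocValueLayoutSketch {
    // Returns the number of bytes written into buffer (assumed big enough).
    static int encode(List<byte[]> values, byte[] buffer) throws IOException {
        ByteArrayDataOutput out = new ByteArrayDataOutput(buffer);
        out.writeVInt(values.size());        // read back as "size" in matches()
        for (byte[] v : values) {
            out.writeVInt(v.length);         // read back as "valLength"
            out.writeBytes(v, 0, v.length);  // the bytes the automaton runs over
        }
        return out.getPosition();
    }
}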
@@ -0,0 +1,575 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */


package org.elasticsearch.xpack.wildcard.mapper;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.DocValuesFieldExistsQuery;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.MultiTermQuery.RewriteMethod;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.automaton.Automaton;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.common.lucene.BytesRefs;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.common.xcontent.support.XContentMapValues;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AnalyzerScope;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.IndexFieldData.XFieldComparatorSource.Nested;
import org.elasticsearch.index.fielddata.IndexFieldDataCache;
import org.elasticsearch.index.fielddata.fieldcomparator.BytesRefFieldComparatorSource;
import org.elasticsearch.index.fielddata.plain.BytesBinaryDVIndexFieldData;
import org.elasticsearch.index.mapper.BinaryFieldMapper.CustomBinaryDocValuesField;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.Mapper;
import org.elasticsearch.index.mapper.MapperParsingException;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.index.mapper.ParseContext;
import org.elasticsearch.index.mapper.ParseContext.Document;
import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.index.similarity.SimilarityProvider;
import org.elasticsearch.indices.breaker.CircuitBreakerService;
import org.elasticsearch.search.MultiValueMode;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import static org.elasticsearch.index.mapper.TypeParsers.parseField;

/**
 * A {@link FieldMapper} for indexing fields with ngrams for efficient wildcard matching
 */
public class WildcardFieldMapper extends FieldMapper {

    public static final String CONTENT_TYPE = "wildcard";
    public static short MAX_CLAUSES_IN_APPROXIMATION_QUERY = 10;
    public static final int NGRAM_SIZE = 3;
    static final NamedAnalyzer WILDCARD_ANALYZER = new NamedAnalyzer("_wildcard", AnalyzerScope.GLOBAL, new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new NGramTokenizer(NGRAM_SIZE, NGRAM_SIZE);
            return new TokenStreamComponents(tokenizer);
        }
    });

    public static class Defaults {
        public static final MappedFieldType FIELD_TYPE = new WildcardFieldType();

        static {
            FIELD_TYPE.setTokenized(false);
            FIELD_TYPE.setIndexAnalyzer(WILDCARD_ANALYZER);
            FIELD_TYPE.setSearchAnalyzer(Lucene.KEYWORD_ANALYZER);
            FIELD_TYPE.setIndexOptions(IndexOptions.DOCS);
            FIELD_TYPE.setStoreTermVectorOffsets(false);
            FIELD_TYPE.setOmitNorms(true);
            FIELD_TYPE.freeze();
        }
        public static final int IGNORE_ABOVE = Integer.MAX_VALUE;
    }

    public static class Builder extends FieldMapper.Builder<Builder, WildcardFieldMapper> {
        protected int ignoreAbove = Defaults.IGNORE_ABOVE;

        public Builder(String name) {
            super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE);
            builder = this;
        }

        @Override
        public Builder docValues(boolean docValues) {
            if (docValues == false) {
                throw new MapperParsingException("The field [" + name + "] cannot have doc values = false");
            }
            return this;
        }

        @Override
        public Builder indexOptions(IndexOptions indexOptions) {
            if (indexOptions != IndexOptions.DOCS) {
                throw new MapperParsingException("The field [" + name + "] cannot have indexOptions = " + indexOptions);
            }
            return this;
        }

        @Override
        public Builder store(boolean store) {
            if (store) {
                throw new MapperParsingException("The field [" + name + "] cannot have store = true");
            }
            return this;
        }

        @Override
        public Builder similarity(SimilarityProvider similarity) {
            throw new MapperParsingException("The field [" + name + "] cannot have custom similarities");
        }

        @Override
        public Builder index(boolean index) {
            if (index == false) {
                throw new MapperParsingException("The field [" + name + "] cannot have index = false");
            }
            return this;
        }

        public Builder ignoreAbove(int ignoreAbove) {
            if (ignoreAbove < 0) {
                throw new IllegalArgumentException("[ignore_above] must be positive, got " + ignoreAbove);
            }
            this.ignoreAbove = ignoreAbove;
            return this;
        }


        @Override
        protected void setupFieldType(BuilderContext context) {
            super.setupFieldType(context);
            fieldType().setHasDocValues(true);
            fieldType().setTokenized(false);
            fieldType().setIndexOptions(IndexOptions.DOCS);
        }

        @Override
        public WildcardFieldType fieldType() {
            return (WildcardFieldType) super.fieldType();
        }

        @Override
        public WildcardFieldMapper build(BuilderContext context) {
            setupFieldType(context);
            return new WildcardFieldMapper(
                    name, fieldType, defaultFieldType, ignoreAbove,
                    context.indexSettings(), multiFieldsBuilder.build(this, context), copyTo);
        }
    }

    public static class TypeParser implements Mapper.TypeParser {
        @Override
        public Mapper.Builder<?, ?> parse(String name, Map<String, Object> node, ParserContext parserContext)
                throws MapperParsingException {
            WildcardFieldMapper.Builder builder = new WildcardFieldMapper.Builder(name);
            parseField(builder, name, node, parserContext);

            for (Iterator<Map.Entry<String, Object>> iterator = node.entrySet().iterator(); iterator.hasNext();) {
                Map.Entry<String, Object> entry = iterator.next();
                String propName = entry.getKey();
                Object propNode = entry.getValue();
                if (propName.equals("ignore_above")) {
                    builder.ignoreAbove(XContentMapValues.nodeIntegerValue(propNode, -1));
                    iterator.remove();
                }
            }

            return builder;
        }
    }

    public static final char TOKEN_START_OR_END_CHAR = 0;

    public static final class WildcardFieldType extends MappedFieldType {

        public WildcardFieldType() {
            setIndexAnalyzer(Lucene.KEYWORD_ANALYZER);
            setSearchAnalyzer(Lucene.KEYWORD_ANALYZER);
        }

        protected WildcardFieldType(WildcardFieldType ref) {
            super(ref);
        }

        public WildcardFieldType clone() {
            WildcardFieldType result = new WildcardFieldType(this);
            return result;
        }


        // Holds parsed information about the wildcard pattern
        static class PatternStructure {
            boolean openStart, openEnd, hasSymbols;
            int lastGap = 0;
            int wildcardCharCount, wildcardStringCount;
            String[] fragments;
            Integer[] precedingGapSizes;
            final String pattern;

            @SuppressWarnings("fallthrough") // Intentionally uses fallthrough mirroring implementation in Lucene's WildcardQuery
            PatternStructure(String wildcardText) {
                this.pattern = wildcardText;
                ArrayList<String> fragmentList = new ArrayList<>();
                ArrayList<Integer> precedingGapSizeList = new ArrayList<>();
                StringBuilder sb = new StringBuilder();
                for (int i = 0; i < wildcardText.length();) {
                    final int c = wildcardText.codePointAt(i);
                    int length = Character.charCount(c);
                    switch (c) {
                        case WildcardQuery.WILDCARD_STRING:
                            if (i == 0) {
                                openStart = true;
                            }
                            openEnd = true;
                            hasSymbols = true;
                            wildcardStringCount++;

                            if (sb.length() > 0) {
                                precedingGapSizeList.add(lastGap);
                                fragmentList.add(sb.toString());
                                sb = new StringBuilder();
                            }
                            lastGap = Integer.MAX_VALUE;
                            break;
                        case WildcardQuery.WILDCARD_CHAR:
                            if (i == 0) {
                                openStart = true;
                            }
                            hasSymbols = true;
                            wildcardCharCount++;
                            openEnd = true;
                            if (sb.length() > 0) {
                                precedingGapSizeList.add(lastGap);
                                fragmentList.add(sb.toString());
                                sb = new StringBuilder();
                                lastGap = 0;
                            }

                            if (lastGap != Integer.MAX_VALUE) {
                                lastGap++;
                            }
                            break;
                        case WildcardQuery.WILDCARD_ESCAPE:
                            // add the next codepoint instead, if it exists
                            if (i + length < wildcardText.length()) {
                                final int nextChar = wildcardText.codePointAt(i + length);
                                length += Character.charCount(nextChar);
                                sb.append(Character.toChars(nextChar));
                                openEnd = false;
                                break;
                            } // else fallthru, lenient parsing with a trailing \
                        default:
                            openEnd = false;
                            sb.append(Character.toChars(c));
                    }
                    i += length;
                }
                if (sb.length() > 0) {
                    precedingGapSizeList.add(lastGap);
                    fragmentList.add(sb.toString());
                    lastGap = 0;
                }
                fragments = fragmentList.toArray(new String[0]);
                precedingGapSizes = precedingGapSizeList.toArray(new Integer[0]);

            }

            public boolean needsVerification() {
                // Return true if term queries are not enough evidence
                if (fragments.length == 1 && wildcardCharCount == 0) {
                    // The one case where we don't need verification is when
                    // we have a single fragment and no ? characters
                    return false;
                }
                return true;
            }

            // Returns number of positions for last gap (Integer.MAX means unlimited gap)
            public int getPrecedingGapSize(int fragmentNum) {
                return precedingGapSizes[fragmentNum];
            }

            public boolean isMatchAll() {
                return fragments.length == 0 && wildcardStringCount > 0 && wildcardCharCount == 0;
            }

            @Override
            public int hashCode() {
                return pattern.hashCode();
            }

            @Override
            public boolean equals(Object obj) {
                PatternStructure other = (PatternStructure) obj;
                return pattern.equals(other.pattern);
            }

        }


        @Override
        public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QueryShardContext context) {
            PatternStructure patternStructure = new PatternStructure(wildcardPattern);
            ArrayList<String> tokens = new ArrayList<>();

            for (int i = 0; i < patternStructure.fragments.length; i++) {
                String fragment = patternStructure.fragments[i];
                int fLength = fragment.length();
                if (fLength == 0) {
                    continue;
                }

                // Add any start/end of string character
                if (i == 0 && patternStructure.openStart == false) {
                    // Start-of-string anchored (is not a leading wildcard)
                    fragment = TOKEN_START_OR_END_CHAR + fragment;
                }
                if (patternStructure.openEnd == false && i == patternStructure.fragments.length - 1) {
                    // End-of-string anchored (is not a trailing wildcard)
                    fragment = fragment + TOKEN_START_OR_END_CHAR + TOKEN_START_OR_END_CHAR;
                }
                if (fragment.codePointCount(0, fragment.length()) <= NGRAM_SIZE) {
                    tokens.add(fragment);
                } else {
                    // Break fragment into multiple Ngrams
                    TokenStream tokenizer = WILDCARD_ANALYZER.tokenStream(name(), fragment);
                    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
                    String lastUnusedToken = null;
                    try {
                        tokenizer.reset();
                        boolean takeThis = true;
                        // minimise number of terms searched - eg for "12345" and 3grams we only need terms
                        // `123` and `345` - no need to search for 234. We take every other ngram.
                        while (tokenizer.incrementToken()) {
                            String tokenValue = termAtt.toString();
                            if (takeThis) {
                                tokens.add(tokenValue);
                            } else {
                                lastUnusedToken = tokenValue;
                            }
                            // alternate
                            takeThis = !takeThis;
                        }
                        if (lastUnusedToken != null) {
                            // given `cake` and 3 grams the loop above would output only `cak` and we need to add trailing
                            // `ake` to complete the logic.
                            tokens.add(lastUnusedToken);
                        }
                        tokenizer.end();
                        tokenizer.close();
                    } catch (IOException ioe) {
                        throw new ElasticsearchParseException("Error parsing wildcard query pattern fragment [" + fragment + "]");
                    }
                }
            }

            if (patternStructure.isMatchAll()) {
                return new MatchAllDocsQuery();
            }
            BooleanQuery approximation = createApproximationQuery(tokens);
            if (approximation.clauses().size() > 1 || patternStructure.needsVerification()) {
                BooleanQuery.Builder verifyingBuilder = new BooleanQuery.Builder();
                verifyingBuilder.add(new BooleanClause(approximation, Occur.MUST));
                Automaton automaton = WildcardQuery.toAutomaton(new Term(name(), wildcardPattern));
                verifyingBuilder.add(new BooleanClause(new AutomatonQueryOnBinaryDv(name(), wildcardPattern, automaton), Occur.MUST));
                return verifyingBuilder.build();
            }
            return approximation;
        }

        private BooleanQuery createApproximationQuery(ArrayList<String> tokens) {
            BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
            if (tokens.size() <= MAX_CLAUSES_IN_APPROXIMATION_QUERY) {
                for (String token : tokens) {
                    addClause(token, bqBuilder);
                }
                return bqBuilder.build();
            }
            // Thin out the number of clauses using a selection spread evenly across the range
            float step = (float) (tokens.size() - 1) / (float) (MAX_CLAUSES_IN_APPROXIMATION_QUERY - 1); // set step size
            for (int i = 0; i < MAX_CLAUSES_IN_APPROXIMATION_QUERY; i++) {
                addClause(tokens.get(Math.round(step * i)), bqBuilder); // add each element of a position which is a multiple of step
            }
            // TODO we can be smarter about pruning here. e.g.
            // * Avoid wildcard queries if there are sufficient numbers of other terms that are full 3grams that are cheaper term queries
            // * We can select terms on their scarcity rather than even spreads across the search string.

            return bqBuilder.build();
        }

        private void addClause(String token, BooleanQuery.Builder bqBuilder) {
            assert token.codePointCount(0, token.length()) <= NGRAM_SIZE;
            if (token.codePointCount(0, token.length()) == NGRAM_SIZE) {
                TermQuery tq = new TermQuery(new Term(name(), token));
                bqBuilder.add(new BooleanClause(tq, Occur.MUST));
            } else {
                WildcardQuery wq = new WildcardQuery(new Term(name(), token + "*"));
                wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE);
                bqBuilder.add(new BooleanClause(wq, Occur.MUST));
            }

        }

        @Override
        public String typeName() {
            return CONTENT_TYPE;
        }

        @Override
        public Query existsQuery(QueryShardContext context) {
            return new DocValuesFieldExistsQuery(name());
        }

        @Override
        public Query termQuery(Object value, QueryShardContext context) {
            return wildcardQuery(BytesRefs.toString(value), MultiTermQuery.CONSTANT_SCORE_REWRITE, context);
        }

        @Override
        public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
            return wildcardQuery(value + "*", method, context);
        }

        @Override
        public Query termsQuery(List<?> values, QueryShardContext context) {
            BooleanQuery.Builder bq = new BooleanQuery.Builder();
            for (Object value : values) {
                bq.add(termQuery(value, context), Occur.SHOULD);
            }
            return new ConstantScoreQuery(bq.build());
        }

        @Override
        public IndexFieldData.Builder fielddataBuilder(String fullyQualifiedIndexName) {
            failIfNoDocValues();
            return new IndexFieldData.Builder() {

                @Override
                public IndexFieldData<?> build(IndexSettings indexSettings, MappedFieldType fieldType, IndexFieldDataCache cache,
                        CircuitBreakerService breakerService, MapperService mapperService) {
                    return new WildcardBytesBinaryDVIndexFieldData(indexSettings.getIndex(), fieldType.name());
                }};
        }
    }

    static class WildcardBytesBinaryDVIndexFieldData extends BytesBinaryDVIndexFieldData {

        WildcardBytesBinaryDVIndexFieldData(Index index, String fieldName) {
            super(index, fieldName);
        }

        @Override
        public SortField sortField(Object missingValue, MultiValueMode sortMode, Nested nested, boolean reverse) {
            XFieldComparatorSource source = new BytesRefFieldComparatorSource(this, missingValue,
                    sortMode, nested);
            return new SortField(getFieldName(), source, reverse);
        }

    }

    private int ignoreAbove;

    private WildcardFieldMapper(String simpleName, MappedFieldType fieldType, MappedFieldType defaultFieldType,
                                int ignoreAbove, Settings indexSettings, MultiFields multiFields, CopyTo copyTo) {
        super(simpleName, fieldType, defaultFieldType, indexSettings, multiFields, copyTo);
        this.ignoreAbove = ignoreAbove;
        assert fieldType.indexOptions() == IndexOptions.DOCS;

        ngramFieldType = fieldType.clone();
        ngramFieldType.setTokenized(true);
        ngramFieldType.freeze();
    }

    /** Values that have more chars than the return value of this method will
     *  be skipped at parsing time. */
    // pkg-private for testing
    int ignoreAbove() {
        return ignoreAbove;
    }

    @Override
    protected WildcardFieldMapper clone() {
        return (WildcardFieldMapper) super.clone();
    }

    @Override
    public WildcardFieldType fieldType() {
        return (WildcardFieldType) super.fieldType();
    }

    @Override
    protected void doXContentBody(XContentBuilder builder, boolean includeDefaults, Params params) throws IOException {
        super.doXContentBody(builder, includeDefaults, params);
        if (includeDefaults || ignoreAbove != Defaults.IGNORE_ABOVE) {
            builder.field("ignore_above", ignoreAbove);
        }
    }

    @Override
    protected void parseCreateField(ParseContext context, List<IndexableField> fields) throws IOException {
        final String value;
        if (context.externalValueSet()) {
            value = context.externalValue().toString();
        } else {
            XContentParser parser = context.parser();
            if (parser.currentToken() == XContentParser.Token.VALUE_NULL) {
                value = fieldType().nullValueAsString();
            } else {
                value = parser.textOrNull();
            }
        }
        ParseContext.Document parseDoc = context.doc();

        createFields(value, parseDoc, fields);
    }

    // For internal use by Lucene only - used to define ngram index
    final MappedFieldType ngramFieldType;

    void createFields(String value, Document parseDoc, List<IndexableField> fields) {
        if (value == null || value.length() > ignoreAbove) {
            return;
        }
        String ngramValue = TOKEN_START_OR_END_CHAR + value + TOKEN_START_OR_END_CHAR + TOKEN_START_OR_END_CHAR;
        Field ngramField = new Field(fieldType().name(), ngramValue, ngramFieldType);
        fields.add(ngramField);

        CustomBinaryDocValuesField dvField = (CustomBinaryDocValuesField) parseDoc.getByKey(fieldType().name());
        if (dvField == null) {
            dvField = new CustomBinaryDocValuesField(fieldType().name(), value.getBytes(StandardCharsets.UTF_8));
            parseDoc.addWithKey(fieldType().name(), dvField);
        } else {
            dvField.add(value.getBytes(StandardCharsets.UTF_8));
        }
    }

    @Override
    protected String contentType() {
        return CONTENT_TYPE;
    }


    @Override
    protected void doMerge(Mapper mergeWith) {
        super.doMerge(mergeWith);
        this.ignoreAbove = ((WildcardFieldMapper) mergeWith).ignoreAbove;
    }
}
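One detail of the mapper worth calling out: createFields() indexes each value wrapped in zero chars (TOKEN_START_OR_END_CHAR + value + two trailing TOKEN_START_OR_END_CHARs), and wildcardQuery() anchors pattern fragments the same way. This guarantees that even a fragment shorter than one ngram, such as the "ld" in a *ld suffix query, still yields full 3-grams that exist in the index. A hypothetical illustration (not code from this commit):

// Illustrates the anchoring convention used by WildcardFieldMapper above.
public class AnchoringSketch {
    static final char ANCHOR = 0; // TOKEN_START_OR_END_CHAR in the mapper

    public static void main(String[] args) {
        // createFields() indexes: ANCHOR + value + ANCHOR + ANCHOR
        String indexedForm = ANCHOR + "world" + ANCHOR + ANCHOR;

        // wildcardQuery() gives the end-anchored fragment of "*ld" the same
        // trailing anchors, so its 3-grams ("ld\0" and "d\0\0") are guaranteed
        // to occur in the indexed form of any value ending in "ld".
        String anchoredFragment = "ld" + ANCHOR + ANCHOR;
        System.out.println(indexedForm.contains(anchoredFragment)); // true
    }
}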
@@ -0,0 +1,331 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */

package org.elasticsearch.xpack.wildcard.mapper;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.cache.bitset.BitsetFilterCache;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.IndexFieldDataCache;
import org.elasticsearch.index.mapper.ContentPath;
import org.elasticsearch.index.mapper.KeywordFieldMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.Mapper;
import org.elasticsearch.index.mapper.MapperParsingException;
import org.elasticsearch.index.mapper.ParseContext;
import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.search.sort.FieldSortBuilder;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.IndexSettingsModule;
import org.elasticsearch.xpack.wildcard.mapper.WildcardFieldMapper.Builder;
import org.junit.Before;
import org.mockito.Mockito;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.function.BiFunction;

import static org.hamcrest.Matchers.equalTo;

public class WildcardFieldMapperTests extends ESTestCase {

    private static final String KEYWORD_FIELD_NAME = "keyword_field";
    private static final String WILDCARD_FIELD_NAME = "wildcard_field";
    static final int MAX_FIELD_LENGTH = 100;
    static WildcardFieldMapper wildcardFieldType;
    static KeywordFieldMapper keywordFieldType;

    @Override
    @Before
    public void setUp() throws Exception {
        Builder builder = new WildcardFieldMapper.Builder(WILDCARD_FIELD_NAME);
        builder.ignoreAbove(MAX_FIELD_LENGTH);
        wildcardFieldType = builder.build(new Mapper.BuilderContext(createIndexSettings().getSettings(), new ContentPath(0)));


        org.elasticsearch.index.mapper.KeywordFieldMapper.Builder kwBuilder = new KeywordFieldMapper.Builder(KEYWORD_FIELD_NAME);
        keywordFieldType = kwBuilder.build(new Mapper.BuilderContext(createIndexSettings().getSettings(), new ContentPath(0)));
        super.setUp();
    }

    public void testIllegalDocValuesArgument() {
        Builder ft = new WildcardFieldMapper.Builder("test");
        MapperParsingException e = expectThrows(MapperParsingException.class,
                () -> ft.docValues(false));
        assertEquals("The field [test] cannot have doc values = false", e.getMessage());
    }

    public void testIllegalIndexedArgument() {
        Builder ft = new WildcardFieldMapper.Builder("test");
        MapperParsingException e = expectThrows(MapperParsingException.class,
                () -> ft.index(false));
        assertEquals("The field [test] cannot have index = false", e.getMessage());
    }

    public void testTooBigKeywordField() throws IOException {
        Directory dir = newDirectory();
        IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER);
        iwc.setMergePolicy(newTieredMergePolicy(random()));
        RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);

        // Create a string that is too large and will not be indexed
        String docContent = randomABString(MAX_FIELD_LENGTH + 1);
        Document doc = new Document();
        ParseContext.Document parseDoc = new ParseContext.Document();
        addFields(parseDoc, doc, docContent);
        indexDoc(parseDoc, doc, iw);

        iw.forceMerge(1);
        DirectoryReader reader = iw.getReader();
        IndexSearcher searcher = newSearcher(reader);
        iw.close();

        Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery("*a*", null, null);
        TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, 10, Sort.INDEXORDER);
        assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(0L));

        reader.close();
        dir.close();
    }

    // Test long query strings don't cause exceptions
    public void testTooBigQueryField() throws IOException {
        Directory dir = newDirectory();
        IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER);
        iwc.setMergePolicy(newTieredMergePolicy(random()));
        RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);

        // Index a short document; the over-long string in this test is the query below, not the content
        String docContent = randomABString(10);
        Document doc = new Document();
        ParseContext.Document parseDoc = new ParseContext.Document();
        addFields(parseDoc, doc, docContent);
        indexDoc(parseDoc, doc, iw);

        iw.forceMerge(1);
        DirectoryReader reader = iw.getReader();
        IndexSearcher searcher = newSearcher(reader);
        iw.close();

        String queryString = randomABString((BooleanQuery.getMaxClauseCount() * 2) + 1);
        Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(queryString, null, null);
        TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, 10, Sort.INDEXORDER);
        assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(0L));

        reader.close();
        dir.close();
    }


    public void testSearchResultsVersusKeywordField() throws IOException {
        Directory dir = newDirectory();
        IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER);
        iwc.setMergePolicy(newTieredMergePolicy(random()));
        RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);

        int numDocs = 100;
        HashSet<String> values = new HashSet<>();
        for (int i = 0; i < numDocs; i++) {
            Document doc = new Document();
            ParseContext.Document parseDoc = new ParseContext.Document();
            String docContent = randomABString(1 + randomInt(MAX_FIELD_LENGTH - 1));
            if (values.contains(docContent) == false) {
                addFields(parseDoc, doc, docContent);
                values.add(docContent);
            }
            // Occasionally add a multi-value field
            if (randomBoolean()) {
                docContent = randomABString(1 + randomInt(MAX_FIELD_LENGTH - 1));
                if (values.contains(docContent) == false) {
                    addFields(parseDoc, doc, docContent);
                    values.add(docContent);
                }
            }
            indexDoc(parseDoc, doc, iw);

        }

        iw.forceMerge(1);
        DirectoryReader reader = iw.getReader();
        IndexSearcher searcher = newSearcher(reader);
        iw.close();

        int numSearches = 100;
        for (int i = 0; i < numSearches; i++) {
            String randomWildcardPattern = getRandomWildcardPattern();

            Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(randomWildcardPattern, null, null);
            TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, values.size() + 1, Sort.INDEXORDER);

            Query keywordFieldQuery = new WildcardQuery(new Term(KEYWORD_FIELD_NAME, randomWildcardPattern));
            TopDocs kwTopDocs = searcher.search(keywordFieldQuery, values.size() + 1, Sort.INDEXORDER);

            assertThat(kwTopDocs.totalHits.value, equalTo(wildcardFieldTopDocs.totalHits.value));

            HashSet<Integer> expectedDocs = new HashSet<>();
            for (ScoreDoc topDoc : kwTopDocs.scoreDocs) {
                expectedDocs.add(topDoc.doc);
            }
            for (ScoreDoc wcTopDoc : wildcardFieldTopDocs.scoreDocs) {
                assertTrue(expectedDocs.remove(wcTopDoc.doc));
            }
            assertThat(expectedDocs.size(), equalTo(0));
        }


        // Test keyword and wildcard sort operations are also equivalent
        QueryShardContext shardContextMock = createMockShardContext();

        FieldSortBuilder wildcardSortBuilder = new FieldSortBuilder(WILDCARD_FIELD_NAME);
        SortField wildcardSortField = wildcardSortBuilder.build(shardContextMock).field;
        ScoreDoc[] wildcardHits = searcher.search(new MatchAllDocsQuery(), numDocs, new Sort(wildcardSortField)).scoreDocs;

        FieldSortBuilder keywordSortBuilder = new FieldSortBuilder(KEYWORD_FIELD_NAME);
        SortField keywordSortField = keywordSortBuilder.build(shardContextMock).field;
        ScoreDoc[] keywordHits = searcher.search(new MatchAllDocsQuery(), numDocs, new Sort(keywordSortField)).scoreDocs;

        assertThat(wildcardHits.length, equalTo(keywordHits.length));
        for (int i = 0; i < wildcardHits.length; i++) {
            assertThat(wildcardHits[i].doc, equalTo(keywordHits[i].doc));
        }

        reader.close();
        dir.close();
    }



    protected MappedFieldType provideMappedFieldType(String name) {
        if (name.equals(WILDCARD_FIELD_NAME)) {
            return wildcardFieldType.fieldType();
        } else {
            return keywordFieldType.fieldType();
        }
    }

    protected final QueryShardContext createMockShardContext() {
        Index index = new Index(randomAlphaOfLengthBetween(1, 10), "_na_");
        IndexSettings idxSettings = IndexSettingsModule.newIndexSettings(index,
                Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build());
        BitsetFilterCache bitsetFilterCache = new BitsetFilterCache(idxSettings, Mockito.mock(BitsetFilterCache.Listener.class));
        BiFunction<MappedFieldType, String, IndexFieldData<?>> indexFieldDataLookup = (fieldType, fieldIndexName) -> {
            IndexFieldData.Builder builder = fieldType.fielddataBuilder(fieldIndexName);
            return builder.build(idxSettings, fieldType, new IndexFieldDataCache.None(), null, null);
        };
        return new QueryShardContext(0, idxSettings, BigArrays.NON_RECYCLING_INSTANCE, bitsetFilterCache, indexFieldDataLookup,
                null, null, null, xContentRegistry(), null, null, null,
                () -> randomNonNegativeLong(), null, null, () -> true) {

            @Override
            public MappedFieldType fieldMapper(String name) {
                return provideMappedFieldType(name);
            }
        };
    }

    private void addFields(ParseContext.Document parseDoc, Document doc, String docContent) throws IOException {
        ArrayList<IndexableField> fields = new ArrayList<>();
        wildcardFieldType.createFields(docContent, parseDoc, fields);

        for (IndexableField indexableField : fields) {
            doc.add(indexableField);
        }
        // Add keyword fields too
        doc.add(new SortedSetDocValuesField(KEYWORD_FIELD_NAME, new BytesRef(docContent)));
        doc.add(new StringField(KEYWORD_FIELD_NAME, docContent, Field.Store.YES));
    }

    private void indexDoc(ParseContext.Document parseDoc, Document doc, RandomIndexWriter iw) throws IOException {
        IndexableField field = parseDoc.getByKey(wildcardFieldType.name());
        if (field != null) {
            doc.add(field);
        }
        iw.addDocument(doc);
    }

    protected IndexSettings createIndexSettings() {
        return new IndexSettings(
                IndexMetaData.builder("_index").settings(Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT))
                        .numberOfShards(1).numberOfReplicas(0).creationDate(System.currentTimeMillis()).build(),
                Settings.EMPTY);
    }


    static String randomABString(int minLength) {
        StringBuilder sb = new StringBuilder();
        while (sb.length() < minLength) {
            if (randomBoolean()) {
                sb.append("a");
            } else {
                sb.append("b");
            }
        }
        return sb.toString();
    }

    private void randomSyntaxChar(StringBuilder sb) {
        switch (randomInt(3)) {
            case 0:
                sb.append(WildcardQuery.WILDCARD_CHAR);
                break;
            case 1:
                sb.append(WildcardQuery.WILDCARD_STRING);
                break;
            case 2:
                sb.append(WildcardQuery.WILDCARD_ESCAPE);
                sb.append(WildcardQuery.WILDCARD_STRING);
                break;
            case 3:
                sb.append(WildcardQuery.WILDCARD_ESCAPE);
                sb.append(WildcardQuery.WILDCARD_CHAR);
                break;
        }
    }

    private String getRandomWildcardPattern() {
        StringBuilder sb = new StringBuilder();
        int numFragments = 1 + randomInt(4);
        if (randomInt(10) == 1) {
            randomSyntaxChar(sb);
        }
        for (int i = 0; i < numFragments; i++) {
            if (i > 0) {
                randomSyntaxChar(sb);
            }
            sb.append(randomABString(1 + randomInt(6)));
        }
        if (randomInt(10) == 1) {
            randomSyntaxChar(sb);
        }
        return sb.toString();
    }
}
@@ -0,0 +1,19 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */


package org.elasticsearch.xpack.wildcard.mapper;

import org.elasticsearch.index.mapper.FieldTypeTestCase;
import org.elasticsearch.index.mapper.MappedFieldType;

public class WildcardFieldTypeTests extends FieldTypeTestCase {

    @Override
    protected MappedFieldType createDefaultFieldType() {
        return new WildcardFieldMapper.WildcardFieldType();
    }
}