From 2c74f3e22ce8cc8407cc3d38fc41a17a9a24f1fe Mon Sep 17 00:00:00 2001 From: markharwood Date: Mon, 16 Mar 2020 15:07:13 +0000 Subject: [PATCH] Backport of new wildcard field type (#53590) * New wildcard field optimised for wildcard queries (#49993) Indexes values using size 3 ngrams and also stores the full original as a binary doc value. Wildcard queries operate by using a cheap approximation query on the ngram field followed up by a more expensive verification query using an automaton on the binary doc values. Also supports aggregations and sorting. --- docs/reference/mapping/types.asciidoc | 4 +- .../reference/mapping/types/wildcard.asciidoc | 55 ++ .../plain/BinaryDVIndexFieldData.java | 17 +- .../license/XPackLicenseState.java | 10 + .../test/wildcard/10_wildcard_basic.yml | 218 +++++++ x-pack/plugin/wildcard/build.gradle | 18 + .../xpack/wildcard/Wildcard.java | 31 + .../mapper/AutomatonQueryOnBinaryDv.java | 104 ++++ .../wildcard/mapper/WildcardFieldMapper.java | 575 ++++++++++++++++++ .../mapper/WildcardFieldMapperTests.java | 331 ++++++++++ .../mapper/WildcardFieldTypeTests.java | 19 + 11 files changed, 1365 insertions(+), 17 deletions(-) create mode 100644 docs/reference/mapping/types/wildcard.asciidoc create mode 100644 x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml create mode 100644 x-pack/plugin/wildcard/build.gradle create mode 100644 x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/Wildcard.java create mode 100644 x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/AutomatonQueryOnBinaryDv.java create mode 100644 x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java create mode 100644 x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java create mode 100644 x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldTypeTests.java diff 
--git a/docs/reference/mapping/types.asciidoc b/docs/reference/mapping/types.asciidoc index 3345ebcbb31..974ce5434ac 100644 --- a/docs/reference/mapping/types.asciidoc +++ b/docs/reference/mapping/types.asciidoc @@ -7,7 +7,7 @@ document: [float] === Core datatypes -string:: <> and <> +string:: <>, <> and <> <>:: `long`, `integer`, `short`, `byte`, `double`, `float`, `half_float`, `scaled_float` <>:: `date` <>:: `date_nanos` @@ -135,3 +135,5 @@ include::types/token-count.asciidoc[] include::types/shape.asciidoc[] include::types/constant-keyword.asciidoc[] + +include::types/wildcard.asciidoc[] diff --git a/docs/reference/mapping/types/wildcard.asciidoc b/docs/reference/mapping/types/wildcard.asciidoc new file mode 100644 index 00000000000..ab13bb41afe --- /dev/null +++ b/docs/reference/mapping/types/wildcard.asciidoc @@ -0,0 +1,55 @@ +[role="xpack"] +[testenv="basic"] +[[wildcard]] +=== Wildcard datatype +++++ +Wildcard +++++ + +A `wildcard` field stores values optimised for wildcard grep-like queries. +Wildcard queries are possible on other field types but suffer from constraints: +* `text` fields limit matching of any wildcard expressions to individual tokens rather than the original whole value held in a field +* `keyword` fields are untokenized but slow at performing wildcard queries (especially patterns with leading wildcards). + +Internally the `wildcard` field indexes the whole field value using ngrams and stores the full string. +The index is used as a rough filter to cut down the number of values that are then checked by retrieving and checking the full values. +This field is especially well suited to run grep-like queries on log lines. Storage costs are typically lower than those of `keyword` +fields but search speeds for exact matches on full terms are slower. 
+ +You index and search a wildcard field as follows: + +[source,console] +-------------------------------------------------- +PUT my_index +{ + "mappings": { + "properties": { + "my_wildcard": { + "type": "wildcard" + } + } + } +} + +PUT my_index/_doc/1 +{ + "my_wildcard" : "This string can be quite lengthy" +} + +GET my_index/_search +{ + "query": { + "wildcard" : { + "my_wildcard": "*quite*lengthy" + } + } +} + + +-------------------------------------------------- + + +==== Limitations + +* `wildcard` fields are untokenized like keyword fields, so do not support queries that rely on word positions such as phrase queries. + diff --git a/server/src/main/java/org/elasticsearch/index/fielddata/plain/BinaryDVIndexFieldData.java b/server/src/main/java/org/elasticsearch/index/fielddata/plain/BinaryDVIndexFieldData.java index 27088382f20..06352640dc1 100644 --- a/server/src/main/java/org/elasticsearch/index/fielddata/plain/BinaryDVIndexFieldData.java +++ b/server/src/main/java/org/elasticsearch/index/fielddata/plain/BinaryDVIndexFieldData.java @@ -21,8 +21,6 @@ package org.elasticsearch.index.fielddata.plain; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.SortField; -import org.apache.lucene.search.SortedSetSortField; -import org.apache.lucene.search.SortedSetSelector; import org.elasticsearch.common.Nullable; import org.elasticsearch.common.util.BigArrays; import org.elasticsearch.index.Index; @@ -54,20 +52,7 @@ public class BinaryDVIndexFieldData extends DocValuesIndexFieldData implements I public SortField sortField(@Nullable Object missingValue, MultiValueMode sortMode, XFieldComparatorSource.Nested nested, boolean reverse) { XFieldComparatorSource source = new BytesRefFieldComparatorSource(this, missingValue, sortMode, nested); - /** - * Check if we can use a simple {@link SortedSetSortField} compatible with index sorting and - * returns a custom sort field otherwise. 
- */ - if (nested != null || - (sortMode != MultiValueMode.MAX && sortMode != MultiValueMode.MIN) || - (source.sortMissingFirst(missingValue) == false && source.sortMissingLast(missingValue) == false)) { - return new SortField(getFieldName(), source, reverse); - } - SortField sortField = new SortedSetSortField(fieldName, reverse, - sortMode == MultiValueMode.MAX ? SortedSetSelector.Type.MAX : SortedSetSelector.Type.MIN); - sortField.setMissingValue(source.sortMissingLast(missingValue) ^ reverse ? - SortedSetSortField.STRING_LAST : SortedSetSortField.STRING_FIRST); - return sortField; + return new SortField(getFieldName(), source, reverse); } @Override diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/license/XPackLicenseState.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/license/XPackLicenseState.java index 6e7a0c376ff..65cca6b81b5 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/license/XPackLicenseState.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/license/XPackLicenseState.java @@ -613,6 +613,16 @@ public class XPackLicenseState { public boolean isVectorsAllowed() { return allowForAllLicenses(); } + + + /** + * Determine if Wildcard support should be enabled. + *

+ * Wildcard is available for all license types except {@link OperationMode#MISSING} + */ + public synchronized boolean isWildcardAllowed() { + return status.active; + } public boolean isOdbcAllowed() { return isAllowedByLicense(OperationMode.PLATINUM); diff --git a/x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml b/x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml new file mode 100644 index 00000000000..bdec75fc548 --- /dev/null +++ b/x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml @@ -0,0 +1,218 @@ +setup: + - skip: + features: headers + version: " - 7.6.99" + reason: "wildcard fields were added from 7.7" + + - do: + indices.create: + index: test-index + body: + settings: + number_of_replicas: 0 + mappings: + properties: + my_wildcard: + type: wildcard + - do: + index: + index: test-index + id: 1 + body: + my_wildcard: hello world + - do: + index: + index: test-index + id: 2 + body: + my_wildcard: goodbye world + + - do: + indices.refresh: {} + +--- +"Short prefix query": + - do: + search: + body: + track_total_hits: true + query: + wildcard: + my_wildcard: {value: "hel*" } + + + - match: {hits.total.value: 1} + +--- +"Long prefix query": + - do: + search: + body: + track_total_hits: true + query: + wildcard: + my_wildcard: {value: "hello wor*" } + + + - match: {hits.total.value: 1} + +--- +"Short unrooted query": + - do: + search: + body: + track_total_hits: true + query: + wildcard: + my_wildcard: {value: "*ello*" } + + + - match: {hits.total.value: 1} + +--- +"Long unrooted query": + - do: + search: + body: + track_total_hits: true + query: + wildcard: + my_wildcard: {value: "*ello worl*" } + + + - match: {hits.total.value: 1} + +--- +"Short suffix query": + - do: + search: + body: + track_total_hits: true + query: + wildcard: + my_wildcard: {value: "*ld" } + + + - match: {hits.total.value: 2} + +--- +"Long suffix query": + - do: + search: + body: + 
track_total_hits: true + query: + wildcard: + my_wildcard: {value: "*ello world" } + + + - match: {hits.total.value: 1} + +--- +"No wildcard wildcard query": + - do: + search: + body: + track_total_hits: true + query: + wildcard: + my_wildcard: {value: "hello world" } + + + - match: {hits.total.value: 1} + +--- +"Term query on wildcard field": + - do: + search: + body: + track_total_hits: true + query: + term: + my_wildcard: "hello world" + + + - match: {hits.total.value: 1} + +--- +"Terms query on wildcard field": + - do: + search: + body: + track_total_hits: true + query: + terms: + my_wildcard: ["hello world", "does not exist"] + + + - match: {hits.total.value: 1} + +--- +"Prefix query on wildcard field": + - do: + search: + body: + track_total_hits: true + query: + prefix: + my_wildcard: + value: "hell*" + + + - match: {hits.total.value: 1} + +--- +"Sequence fail": + - do: + search: + body: + track_total_hits: true + query: + wildcard: + my_wildcard: {value: "*world*hello*" } + + + - match: {hits.total.value: 0} + +--- +"Aggs work": + - do: + search: + body: + track_total_hits: true + query: + wildcard: + my_wildcard: {value: "*world*" } + aggs: + top_vals: + terms: {field: "my_wildcard" } + + + - match: {hits.total.value: 2} + - length: { aggregations.top_vals.buckets: 2 } + +--- +"Sort works": + - do: + search: + body: + track_total_hits: true + sort: [ { "my_wildcard": "desc" } ] + + - match: { hits.total.value: 2 } + - length: { hits.hits: 2 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.1._id: "2" } + + - do: + search: + body: + track_total_hits: true + sort: [ { "my_wildcard": "asc" } ] + + - match: { hits.total.value: 2 } + - length: { hits.hits: 2 } + - match: { hits.hits.0._id: "2" } + - match: { hits.hits.1._id: "1" } + + diff --git a/x-pack/plugin/wildcard/build.gradle b/x-pack/plugin/wildcard/build.gradle new file mode 100644 index 00000000000..22e29aa0832 --- /dev/null +++ b/x-pack/plugin/wildcard/build.gradle @@ -0,0 +1,18 @@ 
+evaluationDependsOn(xpackModule('core')) + +apply plugin: 'elasticsearch.esplugin' + +esplugin { + name 'wildcard' + description 'A plugin for a keyword field type with efficient wildcard search' + classname 'org.elasticsearch.xpack.wildcard.Wildcard' + extendedPlugins = ['x-pack-core'] +} +archivesBaseName = 'x-pack-wildcard' + +dependencies { + compileOnly project(path: xpackModule('core'), configuration: 'default') + testCompile project(path: xpackModule('core'), configuration: 'testArtifacts') +} + +integTest.enabled = false diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/Wildcard.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/Wildcard.java new file mode 100644 index 00000000000..3749dc2622c --- /dev/null +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/Wildcard.java @@ -0,0 +1,31 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +package org.elasticsearch.xpack.wildcard; + +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.mapper.Mapper; +import org.elasticsearch.plugins.MapperPlugin; +import org.elasticsearch.plugins.Plugin; +import org.elasticsearch.xpack.wildcard.mapper.WildcardFieldMapper; + +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.Map; + +public class Wildcard extends Plugin implements MapperPlugin { + + + public Wildcard(Settings settings) { + } + + @Override + public Map getMappers() { + Map mappers = new LinkedHashMap<>(); + mappers.put(WildcardFieldMapper.CONTENT_TYPE, new WildcardFieldMapper.TypeParser()); + return Collections.unmodifiableMap(mappers); + } +} diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/AutomatonQueryOnBinaryDv.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/AutomatonQueryOnBinaryDv.java new file mode 100644 index 00000000000..648fbc7e0cd --- /dev/null +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/AutomatonQueryOnBinaryDv.java @@ -0,0 +1,104 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +package org.elasticsearch.xpack.wildcard.mapper; + +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.search.ConstantScoreScorer; +import org.apache.lucene.search.ConstantScoreWeight; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreMode; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.TwoPhaseIterator; +import org.apache.lucene.search.Weight; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.ByteRunAutomaton; + +import java.io.IOException; +import java.util.Objects; + +/** + * Query that runs an Automaton across all binary doc values. + * Expensive to run so normally used in conjunction with more selective query clauses. + */ +public class AutomatonQueryOnBinaryDv extends Query { + + private final String field; + private final String matchPattern; + private final Automaton automaton; + + public AutomatonQueryOnBinaryDv(String field, String matchPattern, Automaton automaton) { + this.field = field; + this.matchPattern = matchPattern; + this.automaton = automaton; + } + + @Override + public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { + + ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automaton); + + return new ConstantScoreWeight(this, boost) { + + @Override + public Scorer scorer(LeafReaderContext context) throws IOException { + ByteArrayDataInput badi = new ByteArrayDataInput(); + final BinaryDocValues values = DocValues.getBinary(context.reader(), field); + TwoPhaseIterator twoPhase = new TwoPhaseIterator(values) { + @Override + public boolean matches() throws IOException { + BytesRef arrayOfValues = values.binaryValue(); + 
badi.reset(arrayOfValues.bytes); + badi.setPosition(arrayOfValues.offset); + + int size = badi.readVInt(); + for (int i=0; i< size; i++) { + int valLength = badi.readVInt(); + if (bytesMatcher.run(arrayOfValues.bytes, badi.getPosition(), valLength)) { + return true; + } + badi.skipBytes(valLength); + } + return false; + } + + @Override + public float matchCost() { + // TODO: how can we compute this? + return 1000f; + } + }; + return new ConstantScoreScorer(this, score(), scoreMode, twoPhase); + } + + @Override + public boolean isCacheable(LeafReaderContext ctx) { + return true; + } + }; + } + @Override + public String toString(String field) { + return field+":"+matchPattern; + } + + @Override + public boolean equals(Object obj) { + AutomatonQueryOnBinaryDv other = (AutomatonQueryOnBinaryDv) obj; + return Objects.equals(field, other.field) && Objects.equals(matchPattern, other.matchPattern); + } + + @Override + public int hashCode() { + return Objects.hash(field, matchPattern); + } + +} diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java new file mode 100644 index 00000000000..e489d8a35bb --- /dev/null +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java @@ -0,0 +1,575 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + + +package org.elasticsearch.xpack.wildcard.mapper; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.ngram.NGramTokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.ConstantScoreQuery; +import org.apache.lucene.search.DocValuesFieldExistsQuery; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.MultiTermQuery; +import org.apache.lucene.search.MultiTermQuery.RewriteMethod; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.SortField; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.util.automaton.Automaton; +import org.elasticsearch.ElasticsearchParseException; +import org.elasticsearch.common.lucene.BytesRefs; +import org.elasticsearch.common.lucene.Lucene; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.common.xcontent.XContentParser; +import org.elasticsearch.common.xcontent.support.XContentMapValues; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AnalyzerScope; +import org.elasticsearch.index.analysis.NamedAnalyzer; +import org.elasticsearch.index.fielddata.IndexFieldData; +import org.elasticsearch.index.fielddata.IndexFieldData.XFieldComparatorSource.Nested; +import org.elasticsearch.index.fielddata.IndexFieldDataCache; +import 
org.elasticsearch.index.fielddata.fieldcomparator.BytesRefFieldComparatorSource; +import org.elasticsearch.index.fielddata.plain.BytesBinaryDVIndexFieldData; +import org.elasticsearch.index.mapper.BinaryFieldMapper.CustomBinaryDocValuesField; +import org.elasticsearch.index.mapper.FieldMapper; +import org.elasticsearch.index.mapper.MappedFieldType; +import org.elasticsearch.index.mapper.Mapper; +import org.elasticsearch.index.mapper.MapperParsingException; +import org.elasticsearch.index.mapper.MapperService; +import org.elasticsearch.index.mapper.ParseContext; +import org.elasticsearch.index.mapper.ParseContext.Document; +import org.elasticsearch.index.query.QueryShardContext; +import org.elasticsearch.index.similarity.SimilarityProvider; +import org.elasticsearch.indices.breaker.CircuitBreakerService; +import org.elasticsearch.search.MultiValueMode; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +import static org.elasticsearch.index.mapper.TypeParsers.parseField; + +/** + * A {@link FieldMapper} for indexing fields with ngrams for efficient wildcard matching + */ +public class WildcardFieldMapper extends FieldMapper { + + public static final String CONTENT_TYPE = "wildcard"; + public static short MAX_CLAUSES_IN_APPROXIMATION_QUERY = 10; + public static final int NGRAM_SIZE = 3; + static final NamedAnalyzer WILDCARD_ANALYZER = new NamedAnalyzer("_wildcard", AnalyzerScope.GLOBAL, new Analyzer() { + @Override + public TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new NGramTokenizer(NGRAM_SIZE, NGRAM_SIZE); + return new TokenStreamComponents(tokenizer); + } + }); + + public static class Defaults { + public static final MappedFieldType FIELD_TYPE = new WildcardFieldType(); + + static { + FIELD_TYPE.setTokenized(false); + FIELD_TYPE.setIndexAnalyzer(WILDCARD_ANALYZER); + 
FIELD_TYPE.setSearchAnalyzer(Lucene.KEYWORD_ANALYZER); + FIELD_TYPE.setIndexOptions(IndexOptions.DOCS); + FIELD_TYPE.setStoreTermVectorOffsets(false); + FIELD_TYPE.setOmitNorms(true); + FIELD_TYPE.freeze(); + } + public static final int IGNORE_ABOVE = Integer.MAX_VALUE; + } + + public static class Builder extends FieldMapper.Builder { + protected int ignoreAbove = Defaults.IGNORE_ABOVE; + + public Builder(String name) { + super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE); + builder = this; + } + + @Override + public Builder docValues(boolean docValues) { + if (docValues == false) { + throw new MapperParsingException("The field [" + name + "] cannot have doc values = false"); + } + return this; + } + + @Override + public Builder indexOptions(IndexOptions indexOptions) { + if (indexOptions != IndexOptions.DOCS) { + throw new MapperParsingException("The field [" + name + "] cannot have indexOptions = " + indexOptions); + } + return this; + } + + @Override + public Builder store(boolean store) { + if (store) { + throw new MapperParsingException("The field [" + name + "] cannot have store = true"); + } + return this; + } + + @Override + public Builder similarity(SimilarityProvider similarity) { + throw new MapperParsingException("The field [" + name + "] cannot have custom similarities"); + } + + @Override + public Builder index(boolean index) { + if (index == false) { + throw new MapperParsingException("The field [" + name + "] cannot have index = false"); + } + return this; + } + + public Builder ignoreAbove(int ignoreAbove) { + if (ignoreAbove < 0) { + throw new IllegalArgumentException("[ignore_above] must be positive, got " + ignoreAbove); + } + this.ignoreAbove = ignoreAbove; + return this; + } + + + @Override + protected void setupFieldType(BuilderContext context) { + super.setupFieldType(context); + fieldType().setHasDocValues(true); + fieldType().setTokenized(false); + fieldType().setIndexOptions(IndexOptions.DOCS); + } + + @Override + public 
WildcardFieldType fieldType() { + return (WildcardFieldType) super.fieldType(); + } + + @Override + public WildcardFieldMapper build(BuilderContext context) { + setupFieldType(context); + return new WildcardFieldMapper( + name, fieldType, defaultFieldType, ignoreAbove, + context.indexSettings(), multiFieldsBuilder.build(this, context), copyTo); + } + } + + public static class TypeParser implements Mapper.TypeParser { + @Override + public Mapper.Builder parse(String name, Map node, ParserContext parserContext) + throws MapperParsingException { + WildcardFieldMapper.Builder builder = new WildcardFieldMapper.Builder(name); + parseField(builder, name, node, parserContext); + + for (Iterator> iterator = node.entrySet().iterator(); iterator.hasNext();) { + Map.Entry entry = iterator.next(); + String propName = entry.getKey(); + Object propNode = entry.getValue(); + if (propName.equals("ignore_above")) { + builder.ignoreAbove(XContentMapValues.nodeIntegerValue(propNode, -1)); + iterator.remove(); + } + } + + return builder; + } + } + + public static final char TOKEN_START_OR_END_CHAR = 0; + + public static final class WildcardFieldType extends MappedFieldType { + + public WildcardFieldType() { + setIndexAnalyzer(Lucene.KEYWORD_ANALYZER); + setSearchAnalyzer(Lucene.KEYWORD_ANALYZER); + } + + protected WildcardFieldType(WildcardFieldType ref) { + super(ref); + } + + public WildcardFieldType clone() { + WildcardFieldType result = new WildcardFieldType(this); + return result; + } + + + // Holds parsed information about the wildcard pattern + static class PatternStructure { + boolean openStart, openEnd, hasSymbols; + int lastGap =0; + int wildcardCharCount, wildcardStringCount; + String[] fragments; + Integer [] precedingGapSizes; + final String pattern; + + @SuppressWarnings("fallthrough") // Intentionally uses fallthrough mirroring implementation in Lucene's WildcardQuery + PatternStructure (String wildcardText) { + this.pattern = wildcardText; + ArrayList fragmentList = new 
ArrayList<>(); + ArrayList precedingGapSizeList = new ArrayList<>(); + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < wildcardText.length();) { + final int c = wildcardText.codePointAt(i); + int length = Character.charCount(c); + switch (c) { + case WildcardQuery.WILDCARD_STRING: + if (i == 0) { + openStart = true; + } + openEnd = true; + hasSymbols = true; + wildcardStringCount++; + + if (sb.length() > 0) { + precedingGapSizeList.add(lastGap); + fragmentList.add(sb.toString()); + sb = new StringBuilder(); + } + lastGap = Integer.MAX_VALUE; + break; + case WildcardQuery.WILDCARD_CHAR: + if (i == 0) { + openStart = true; + } + hasSymbols = true; + wildcardCharCount++; + openEnd = true; + if (sb.length() > 0) { + precedingGapSizeList.add(lastGap); + fragmentList.add(sb.toString()); + sb = new StringBuilder(); + lastGap = 0; + } + + if (lastGap != Integer.MAX_VALUE) { + lastGap++; + } + break; + case WildcardQuery.WILDCARD_ESCAPE: + // add the next codepoint instead, if it exists + if (i + length < wildcardText.length()) { + final int nextChar = wildcardText.codePointAt(i + length); + length += Character.charCount(nextChar); + sb.append(Character.toChars(nextChar)); + openEnd = false; + break; + } // else fallthru, lenient parsing with a trailing \ + default: + openEnd = false; + sb.append(Character.toChars(c)); + } + i += length; + } + if (sb.length() > 0) { + precedingGapSizeList.add(lastGap); + fragmentList.add(sb.toString()); + lastGap = 0; + } + fragments = fragmentList.toArray(new String[0]); + precedingGapSizes = precedingGapSizeList.toArray(new Integer[0]); + + } + + public boolean needsVerification() { + // Return true if term queries are not enough evidence + if (fragments.length == 1 && wildcardCharCount == 0) { + // The one case where we don't need verification is when + // we have a single fragment and no ? 
characters + return false; + } + return true; + } + + // Returns number of positions for last gap (Integer.MAX means unlimited gap) + public int getPrecedingGapSize(int fragmentNum) { + return precedingGapSizes[fragmentNum]; + } + + public boolean isMatchAll() { + return fragments.length == 0 && wildcardStringCount >0 && wildcardCharCount ==0; + } + + @Override + public int hashCode() { + return pattern.hashCode(); + } + + @Override + public boolean equals(Object obj) { + PatternStructure other = (PatternStructure) obj; + return pattern.equals(other.pattern); + } + + + } + + + @Override + public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QueryShardContext context) { + PatternStructure patternStructure = new PatternStructure(wildcardPattern); + ArrayList tokens = new ArrayList<>(); + + for (int i = 0; i < patternStructure.fragments.length; i++) { + String fragment = patternStructure.fragments[i]; + int fLength = fragment.length(); + if (fLength == 0) { + continue; + } + + // Add any start/end of string character + if (i == 0 && patternStructure.openStart == false) { + // Start-of-string anchored (is not a leading wildcard) + fragment = TOKEN_START_OR_END_CHAR + fragment; + } + if (patternStructure.openEnd == false && i == patternStructure.fragments.length - 1) { + // End-of-string anchored (is not a trailing wildcard) + fragment = fragment + TOKEN_START_OR_END_CHAR + TOKEN_START_OR_END_CHAR; + } + if (fragment.codePointCount(0, fragment.length()) <= NGRAM_SIZE) { + tokens.add(fragment); + } else { + // Break fragment into multiple Ngrams + TokenStream tokenizer = WILDCARD_ANALYZER.tokenStream(name(), fragment); + CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class); + String lastUnusedToken = null; + try { + tokenizer.reset(); + boolean takeThis = true; + // minimise number of terms searched - eg for "12345" and 3grams we only need terms + // `123` and `345` - no need to search for 234. We take every other ngram. 
+ while (tokenizer.incrementToken()) { + String tokenValue = termAtt.toString(); + if (takeThis) { + tokens.add(tokenValue); + } else { + lastUnusedToken = tokenValue; + } + // alternate + takeThis = !takeThis; + } + if (lastUnusedToken != null) { + // given `cake` and 3 grams the loop above would output only `cak` and we need to add trailing + // `ake` to complete the logic. + tokens.add(lastUnusedToken); + } + tokenizer.end(); + tokenizer.close(); + } catch (IOException ioe) { + throw new ElasticsearchParseException("Error parsing wildcard query pattern fragment [" + fragment + "]"); + } + } + } + + if (patternStructure.isMatchAll()) { + return new MatchAllDocsQuery(); + } + BooleanQuery approximation = createApproximationQuery(tokens); + if (approximation.clauses().size() > 1 || patternStructure.needsVerification()) { + BooleanQuery.Builder verifyingBuilder = new BooleanQuery.Builder(); + verifyingBuilder.add(new BooleanClause(approximation, Occur.MUST)); + Automaton automaton = WildcardQuery.toAutomaton(new Term(name(), wildcardPattern)); + verifyingBuilder.add(new BooleanClause(new AutomatonQueryOnBinaryDv(name(), wildcardPattern, automaton), Occur.MUST)); + return verifyingBuilder.build(); + } + return approximation; + } + + private BooleanQuery createApproximationQuery(ArrayList tokens) { + BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder(); + if (tokens.size() <= MAX_CLAUSES_IN_APPROXIMATION_QUERY) { + for (String token : tokens) { + addClause(token, bqBuilder); + } + return bqBuilder.build(); + } + // Thin out the number of clauses using a selection spread evenly across the range + float step = (float) (tokens.size() - 1) / (float) (MAX_CLAUSES_IN_APPROXIMATION_QUERY - 1); // set step size + for (int i = 0; i < MAX_CLAUSES_IN_APPROXIMATION_QUERY; i++) { + addClause(tokens.get(Math.round(step * i)), bqBuilder); // add each element of a position which is a multiple of step + } + // TODO we can be smarter about pruning here. e.g. 
+ // * Avoid wildcard queries if there are sufficient numbers of other terms that are full 3grams that are cheaper term queries + // * We can select terms on their scarcity rather than even spreads across the search string. + + return bqBuilder.build(); + } + + private void addClause(String token, BooleanQuery.Builder bqBuilder) { + assert token.codePointCount(0, token.length()) <= NGRAM_SIZE; + if (token.codePointCount(0, token.length()) == NGRAM_SIZE) { + TermQuery tq = new TermQuery(new Term(name(), token)); + bqBuilder.add(new BooleanClause(tq, Occur.MUST)); + } else { + WildcardQuery wq = new WildcardQuery(new Term(name(), token + "*")); + wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE); + bqBuilder.add(new BooleanClause(wq, Occur.MUST)); + } + + } + + @Override + public String typeName() { + return CONTENT_TYPE; + } + + @Override + public Query existsQuery(QueryShardContext context) { + return new DocValuesFieldExistsQuery(name()); + } + + @Override + public Query termQuery(Object value, QueryShardContext context) { + return wildcardQuery(BytesRefs.toString(value), MultiTermQuery.CONSTANT_SCORE_REWRITE, context); + } + + @Override + public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) { + return wildcardQuery(value + "*", method, context); + } + + @Override + public Query termsQuery(List values, QueryShardContext context) { + BooleanQuery.Builder bq = new BooleanQuery.Builder(); + for (Object value : values) { + bq.add(termQuery(value, context), Occur.SHOULD); + } + return new ConstantScoreQuery(bq.build()); + } + + @Override + public IndexFieldData.Builder fielddataBuilder(String fullyQualifiedIndexName) { + failIfNoDocValues(); + return new IndexFieldData.Builder() { + + @Override + public IndexFieldData build(IndexSettings indexSettings, MappedFieldType fieldType, IndexFieldDataCache cache, + CircuitBreakerService breakerService, MapperService mapperService) { + return new 
WildcardBytesBinaryDVIndexFieldData(indexSettings.getIndex(), fieldType.name()); + }}; + } + } + + static class WildcardBytesBinaryDVIndexFieldData extends BytesBinaryDVIndexFieldData{ + + WildcardBytesBinaryDVIndexFieldData(Index index, String fieldName) { + super(index, fieldName); + } + + @Override + public SortField sortField(Object missingValue, MultiValueMode sortMode, Nested nested, boolean reverse) { + XFieldComparatorSource source = new BytesRefFieldComparatorSource(this, missingValue, + sortMode, nested); + return new SortField(getFieldName(), source, reverse); + } + + } + + private int ignoreAbove; + + private WildcardFieldMapper(String simpleName, MappedFieldType fieldType, MappedFieldType defaultFieldType, + int ignoreAbove, Settings indexSettings, MultiFields multiFields, CopyTo copyTo) { + super(simpleName, fieldType, defaultFieldType, indexSettings, multiFields, copyTo); + this.ignoreAbove = ignoreAbove; + assert fieldType.indexOptions() == IndexOptions.DOCS; + + ngramFieldType = fieldType.clone(); + ngramFieldType.setTokenized(true); + ngramFieldType.freeze(); + } + + /** Values that have more chars than the return value of this method will + * be skipped at parsing time. 
*/ + // pkg-private for testing + int ignoreAbove() { + return ignoreAbove; + } + + @Override + protected WildcardFieldMapper clone() { + return (WildcardFieldMapper) super.clone(); + } + + @Override + public WildcardFieldType fieldType() { + return (WildcardFieldType) super.fieldType(); + } + + @Override + protected void doXContentBody(XContentBuilder builder, boolean includeDefaults, Params params) throws IOException { + super.doXContentBody(builder, includeDefaults, params); + if (includeDefaults || ignoreAbove != Defaults.IGNORE_ABOVE) { + builder.field("ignore_above", ignoreAbove); + } + } + + @Override + protected void parseCreateField(ParseContext context, List fields) throws IOException { + final String value; + if (context.externalValueSet()) { + value = context.externalValue().toString(); + } else { + XContentParser parser = context.parser(); + if (parser.currentToken() == XContentParser.Token.VALUE_NULL) { + value = fieldType().nullValueAsString(); + } else { + value = parser.textOrNull(); + } + } + ParseContext.Document parseDoc = context.doc(); + + createFields(value, parseDoc, fields); + } + + // For internal use by Lucene only - used to define ngram index + final MappedFieldType ngramFieldType; + + void createFields(String value, Document parseDoc, Listfields) { + if (value == null || value.length() > ignoreAbove) { + return; + } + String ngramValue = TOKEN_START_OR_END_CHAR + value + TOKEN_START_OR_END_CHAR + TOKEN_START_OR_END_CHAR; + Field ngramField = new Field(fieldType().name(), ngramValue, ngramFieldType); + fields.add(ngramField); + + CustomBinaryDocValuesField dvField = (CustomBinaryDocValuesField) parseDoc.getByKey(fieldType().name()); + if (dvField == null) { + dvField = new CustomBinaryDocValuesField(fieldType().name(), value.getBytes(StandardCharsets.UTF_8)); + parseDoc.addWithKey(fieldType().name(), dvField); + } else { + dvField.add(value.getBytes(StandardCharsets.UTF_8)); + } + } + + @Override + protected String contentType() { + 
return CONTENT_TYPE; + } + + + @Override + protected void doMerge(Mapper mergeWith) { + super.doMerge(mergeWith); + this.ignoreAbove = ((WildcardFieldMapper) mergeWith).ignoreAbove; + } +} diff --git a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java new file mode 100644 index 00000000000..f8c8ddc7f5e --- /dev/null +++ b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java @@ -0,0 +1,331 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +package org.elasticsearch.xpack.wildcard.mapper; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.SortedSetDocValuesField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.Version; +import org.elasticsearch.cluster.metadata.IndexMetaData; +import org.elasticsearch.common.settings.Settings; +import 
org.elasticsearch.common.util.BigArrays; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.cache.bitset.BitsetFilterCache; +import org.elasticsearch.index.fielddata.IndexFieldData; +import org.elasticsearch.index.fielddata.IndexFieldDataCache; +import org.elasticsearch.index.mapper.ContentPath; +import org.elasticsearch.index.mapper.KeywordFieldMapper; +import org.elasticsearch.index.mapper.MappedFieldType; +import org.elasticsearch.index.mapper.Mapper; +import org.elasticsearch.index.mapper.MapperParsingException; +import org.elasticsearch.index.mapper.ParseContext; +import org.elasticsearch.index.query.QueryShardContext; +import org.elasticsearch.search.sort.FieldSortBuilder; +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.test.IndexSettingsModule; +import org.elasticsearch.xpack.wildcard.mapper.WildcardFieldMapper.Builder; +import org.junit.Before; +import org.mockito.Mockito; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.function.BiFunction; + +import static org.hamcrest.Matchers.equalTo; + +public class WildcardFieldMapperTests extends ESTestCase { + + private static final String KEYWORD_FIELD_NAME = "keyword_field"; + private static final String WILDCARD_FIELD_NAME = "wildcard_field"; + static final int MAX_FIELD_LENGTH = 100; + static WildcardFieldMapper wildcardFieldType; + static KeywordFieldMapper keywordFieldType; + + @Override + @Before + public void setUp() throws Exception { + Builder builder = new WildcardFieldMapper.Builder(WILDCARD_FIELD_NAME); + builder.ignoreAbove(MAX_FIELD_LENGTH); + wildcardFieldType = builder.build(new Mapper.BuilderContext(createIndexSettings().getSettings(), new ContentPath(0))); + + + org.elasticsearch.index.mapper.KeywordFieldMapper.Builder kwBuilder = new KeywordFieldMapper.Builder(KEYWORD_FIELD_NAME); + keywordFieldType = kwBuilder.build(new 
Mapper.BuilderContext(createIndexSettings().getSettings(), new ContentPath(0))); + super.setUp(); + } + + public void testIllegalDocValuesArgument() { + Builder ft = new WildcardFieldMapper.Builder("test"); + MapperParsingException e = expectThrows(MapperParsingException.class, + () -> ft.docValues(false)); + assertEquals("The field [test] cannot have doc values = false", e.getMessage()); + } + + public void testIllegalIndexedArgument() { + Builder ft = new WildcardFieldMapper.Builder("test"); + MapperParsingException e = expectThrows(MapperParsingException.class, + () -> ft.index(false)); + assertEquals("The field [test] cannot have index = false", e.getMessage()); + } + + public void testTooBigKeywordField() throws IOException { + Directory dir = newDirectory(); + IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER); + iwc.setMergePolicy(newTieredMergePolicy(random())); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + // Create a string that is too large and will not be indexed + String docContent = randomABString(MAX_FIELD_LENGTH + 1); + Document doc = new Document(); + ParseContext.Document parseDoc = new ParseContext.Document(); + addFields(parseDoc, doc, docContent); + indexDoc(parseDoc, doc, iw); + + iw.forceMerge(1); + DirectoryReader reader = iw.getReader(); + IndexSearcher searcher = newSearcher(reader); + iw.close(); + + Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery("*a*", null, null); + TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, 10, Sort.INDEXORDER); + assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(0L)); + + reader.close(); + dir.close(); + } + + //Test long query strings don't cause exceptions + public void testTooBigQueryField() throws IOException { + Directory dir = newDirectory(); + IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER); + iwc.setMergePolicy(newTieredMergePolicy(random())); + RandomIndexWriter 
iw = new RandomIndexWriter(random(), dir, iwc); + + // Create a string that is too large and will not be indexed + String docContent = randomABString(10); + Document doc = new Document(); + ParseContext.Document parseDoc = new ParseContext.Document(); + addFields(parseDoc, doc, docContent); + indexDoc(parseDoc, doc, iw); + + iw.forceMerge(1); + DirectoryReader reader = iw.getReader(); + IndexSearcher searcher = newSearcher(reader); + iw.close(); + + String queryString = randomABString((BooleanQuery.getMaxClauseCount() * 2) + 1); + Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(queryString, null, null); + TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, 10, Sort.INDEXORDER); + assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(0L)); + + reader.close(); + dir.close(); + } + + + public void testSearchResultsVersusKeywordField() throws IOException { + Directory dir = newDirectory(); + IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER); + iwc.setMergePolicy(newTieredMergePolicy(random())); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + int numDocs = 100; + HashSet values = new HashSet<>(); + for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + ParseContext.Document parseDoc = new ParseContext.Document(); + String docContent = randomABString(1 + randomInt(MAX_FIELD_LENGTH - 1)); + if (values.contains(docContent) == false) { + addFields(parseDoc, doc, docContent); + values.add(docContent); + } + // Occasionally add a multi-value field + if (randomBoolean()) { + docContent = randomABString(1 + randomInt(MAX_FIELD_LENGTH - 1)); + if (values.contains(docContent) == false) { + addFields(parseDoc, doc, docContent); + values.add(docContent); + } + } + indexDoc(parseDoc, doc, iw); + + } + + iw.forceMerge(1); + DirectoryReader reader = iw.getReader(); + IndexSearcher searcher = newSearcher(reader); + iw.close(); + + int numSearches = 100; + for (int i = 0; i 
< numSearches; i++) { + String randomWildcardPattern = getRandomWildcardPattern(); + + Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(randomWildcardPattern, null, null); + TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, values.size() + 1, Sort.INDEXORDER); + + Query keywordFieldQuery = new WildcardQuery(new Term(KEYWORD_FIELD_NAME, randomWildcardPattern)); + TopDocs kwTopDocs = searcher.search(keywordFieldQuery, values.size() + 1, Sort.INDEXORDER); + + assertThat(kwTopDocs.totalHits.value, equalTo(wildcardFieldTopDocs.totalHits.value)); + + HashSet expectedDocs = new HashSet<>(); + for (ScoreDoc topDoc : kwTopDocs.scoreDocs) { + expectedDocs.add(topDoc.doc); + } + for (ScoreDoc wcTopDoc : wildcardFieldTopDocs.scoreDocs) { + assertTrue(expectedDocs.remove(wcTopDoc.doc)); + } + assertThat(expectedDocs.size(), equalTo(0)); + } + + + //Test keyword and wildcard sort operations are also equivalent + QueryShardContext shardContextMock = createMockShardContext(); + + FieldSortBuilder wildcardSortBuilder = new FieldSortBuilder(WILDCARD_FIELD_NAME); + SortField wildcardSortField = wildcardSortBuilder.build(shardContextMock).field; + ScoreDoc[] wildcardHits = searcher.search(new MatchAllDocsQuery(), numDocs, new Sort(wildcardSortField)).scoreDocs; + + FieldSortBuilder keywordSortBuilder = new FieldSortBuilder(KEYWORD_FIELD_NAME); + SortField keywordSortField = keywordSortBuilder.build(shardContextMock).field; + ScoreDoc[] keywordHits = searcher.search(new MatchAllDocsQuery(), numDocs, new Sort(keywordSortField)).scoreDocs; + + assertThat(wildcardHits.length, equalTo(keywordHits.length)); + for (int i = 0; i < wildcardHits.length; i++) { + assertThat(wildcardHits[i].doc, equalTo(keywordHits[i].doc)); + } + + reader.close(); + dir.close(); + } + + + + protected MappedFieldType provideMappedFieldType(String name) { + if (name.equals(WILDCARD_FIELD_NAME)) { + return wildcardFieldType.fieldType(); + } else { + return 
keywordFieldType.fieldType(); + } + } + + protected final QueryShardContext createMockShardContext() { + Index index = new Index(randomAlphaOfLengthBetween(1, 10), "_na_"); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings(index, + Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build()); + BitsetFilterCache bitsetFilterCache = new BitsetFilterCache(idxSettings, Mockito.mock(BitsetFilterCache.Listener.class)); + BiFunction> indexFieldDataLookup = (fieldType, fieldIndexName) -> { + IndexFieldData.Builder builder = fieldType.fielddataBuilder(fieldIndexName); + return builder.build(idxSettings, fieldType, new IndexFieldDataCache.None(), null, null); + }; + return new QueryShardContext(0, idxSettings, BigArrays.NON_RECYCLING_INSTANCE, bitsetFilterCache, indexFieldDataLookup, + null, null, null, xContentRegistry(), null, null, null, + () -> randomNonNegativeLong(), null, null, () -> true) { + + @Override + public MappedFieldType fieldMapper(String name) { + return provideMappedFieldType(name); + } + }; + } + + private void addFields(ParseContext.Document parseDoc, Document doc, String docContent) throws IOException { + ArrayList fields = new ArrayList<>(); + wildcardFieldType.createFields(docContent, parseDoc, fields); + + for (IndexableField indexableField : fields) { + doc.add(indexableField); + } + // Add keyword fields too + doc.add(new SortedSetDocValuesField(KEYWORD_FIELD_NAME, new BytesRef(docContent))); + doc.add(new StringField(KEYWORD_FIELD_NAME, docContent, Field.Store.YES)); + } + + private void indexDoc(ParseContext.Document parseDoc, Document doc, RandomIndexWriter iw) throws IOException { + IndexableField field = parseDoc.getByKey(wildcardFieldType.name()); + if (field != null) { + doc.add(field); + } + iw.addDocument(doc); + } + + protected IndexSettings createIndexSettings() { + return new IndexSettings( + 
IndexMetaData.builder("_index").settings(Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)) + .numberOfShards(1).numberOfReplicas(0).creationDate(System.currentTimeMillis()).build(), + Settings.EMPTY); + } + + + static String randomABString(int minLength) { + StringBuilder sb = new StringBuilder(); + while (sb.length() < minLength) { + if (randomBoolean()) { + sb.append("a"); + } else { + sb.append("b"); + } + } + return sb.toString(); + } + + private void randomSyntaxChar(StringBuilder sb) { + switch (randomInt(3)) { + case 0: + sb.append(WildcardQuery.WILDCARD_CHAR); + break; + case 1: + sb.append(WildcardQuery.WILDCARD_STRING); + break; + case 2: + sb.append(WildcardQuery.WILDCARD_ESCAPE); + sb.append(WildcardQuery.WILDCARD_STRING); + break; + case 3: + sb.append(WildcardQuery.WILDCARD_ESCAPE); + sb.append(WildcardQuery.WILDCARD_CHAR); + break; + } + } + + private String getRandomWildcardPattern() { + StringBuilder sb = new StringBuilder(); + int numFragments = 1 + randomInt(4); + if (randomInt(10) == 1) { + randomSyntaxChar(sb); + } + for (int i = 0; i < numFragments; i++) { + if (i > 0) { + randomSyntaxChar(sb); + } + sb.append(randomABString(1 + randomInt(6))); + } + if (randomInt(10) == 1) { + randomSyntaxChar(sb); + } + return sb.toString(); + } +} diff --git a/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldTypeTests.java b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldTypeTests.java new file mode 100644 index 00000000000..0d090e4a250 --- /dev/null +++ b/x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldTypeTests.java @@ -0,0 +1,19 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + + +package org.elasticsearch.xpack.wildcard.mapper; + +import org.elasticsearch.index.mapper.FieldTypeTestCase; +import org.elasticsearch.index.mapper.MappedFieldType; + +public class WildcardFieldTypeTests extends FieldTypeTestCase { + + @Override + protected MappedFieldType createDefaultFieldType() { + return new WildcardFieldMapper.WildcardFieldType(); + } +}