Backport of #55548. Adds equivalence with the keyword field to the wildcard field. Regex, fuzzy, wildcard and prefix queries are all supported. All queries use an approximation query backed by an automaton-based verification query. Closes #54275
This commit is contained in:
parent
9f1e3bc82b
commit
b2bc6071fd
|
@ -1,8 +1,8 @@
|
|||
setup:
|
||||
- skip:
|
||||
features: headers
|
||||
version: " - 7.7.99"
|
||||
reason: "wildcard fields were added from 7.8"
|
||||
version: " - 7.8.99"
|
||||
reason: "wildcard fields were added from 7.9"
|
||||
|
||||
- do:
|
||||
indices.create:
|
||||
|
|
|
@ -24,6 +24,7 @@ import org.apache.lucene.util.automaton.ByteRunAutomaton;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.Objects;
|
||||
import java.util.function.Supplier;
|
||||
|
||||
/**
|
||||
* Query that runs an Automaton across all binary doc values.
|
||||
|
@ -33,18 +34,19 @@ public class AutomatonQueryOnBinaryDv extends Query {
|
|||
|
||||
private final String field;
|
||||
private final String matchPattern;
|
||||
private final Automaton automaton;
|
||||
private final Supplier<Automaton> automatonSupplier;
|
||||
|
||||
public AutomatonQueryOnBinaryDv(String field, String matchPattern, Automaton automaton) {
|
||||
public AutomatonQueryOnBinaryDv(String field, String matchPattern, Supplier<Automaton> automatonSupplier) {
|
||||
this.field = field;
|
||||
this.matchPattern = matchPattern;
|
||||
this.automaton = automaton;
|
||||
this.automatonSupplier = automatonSupplier;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
|
||||
|
||||
ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automaton);
|
||||
|
||||
ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automatonSupplier.get());
|
||||
|
||||
return new ConstantScoreWeight(this, boost) {
|
||||
|
||||
|
@ -92,6 +94,9 @@ public class AutomatonQueryOnBinaryDv extends Query {
|
|||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (obj == null || obj.getClass() != getClass()) {
|
||||
return false;
|
||||
}
|
||||
AutomatonQueryOnBinaryDv other = (AutomatonQueryOnBinaryDv) obj;
|
||||
return Objects.equals(field, other.field) && Objects.equals(matchPattern, other.matchPattern);
|
||||
}
|
||||
|
|
|
@ -0,0 +1,50 @@
|
|||
/*
|
||||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
||||
* or more contributor license agreements. Licensed under the Elastic License;
|
||||
* you may not use this file except in compliance with the Elastic License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.xpack.wildcard.mapper;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.search.MatchAllDocsQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.QueryVisitor;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* A query that matches all documents. The class is more of a marker
|
||||
* that we encountered something that will need verification.
|
||||
* (A MatchAllDocs query is used to indicate we can match all
|
||||
* _without_ verification)
|
||||
*/
|
||||
public final class MatchAllButRequireVerificationQuery extends Query {
|
||||
|
||||
@Override
|
||||
public Query rewrite(IndexReader reader) throws IOException {
|
||||
return new MatchAllDocsQuery();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString(String field) {
|
||||
return "*:* (tbc)";
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
return sameClassAs(o);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return classHash();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void visit(QueryVisitor visitor) {
|
||||
visitor.visitLeaf(this);
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -21,24 +21,32 @@ import org.apache.lucene.search.BooleanClause.Occur;
|
|||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.ConstantScoreQuery;
|
||||
import org.apache.lucene.search.DocValuesFieldExistsQuery;
|
||||
import org.apache.lucene.search.FuzzyQuery;
|
||||
import org.apache.lucene.search.MatchAllDocsQuery;
|
||||
import org.apache.lucene.search.MatchNoDocsQuery;
|
||||
import org.apache.lucene.search.MultiTermQuery;
|
||||
import org.apache.lucene.search.MultiTermQuery.RewriteMethod;
|
||||
import org.apache.lucene.search.PrefixQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.SortField;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.WildcardQuery;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.RegExp;
|
||||
import org.apache.lucene.util.automaton.RegExp.Kind;
|
||||
import org.elasticsearch.ElasticsearchException;
|
||||
import org.elasticsearch.ElasticsearchParseException;
|
||||
import org.elasticsearch.common.lucene.BytesRefs;
|
||||
import org.elasticsearch.common.lucene.Lucene;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.common.unit.Fuzziness;
|
||||
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||
import org.elasticsearch.common.xcontent.XContentParser;
|
||||
import org.elasticsearch.common.xcontent.support.XContentMapValues;
|
||||
import org.elasticsearch.index.Index;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.AnalyzerScope;
|
||||
import org.elasticsearch.index.analysis.LowercaseNormalizer;
|
||||
import org.elasticsearch.index.analysis.NamedAnalyzer;
|
||||
import org.elasticsearch.index.fielddata.IndexFieldData;
|
||||
import org.elasticsearch.index.fielddata.IndexFieldData.XFieldComparatorSource.Nested;
|
||||
|
@ -63,11 +71,16 @@ import org.elasticsearch.search.aggregations.support.ValuesSourceType;
|
|||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.function.Supplier;
|
||||
|
||||
import static org.elasticsearch.index.mapper.TypeParsers.parseField;
|
||||
import static org.elasticsearch.search.SearchService.ALLOW_EXPENSIVE_QUERIES;
|
||||
|
||||
/**
|
||||
* A {@link FieldMapper} for indexing fields with ngrams for efficient wildcard matching
|
||||
|
@ -206,9 +219,13 @@ public class WildcardFieldMapper extends FieldMapper {
|
|||
}
|
||||
|
||||
public static final char TOKEN_START_OR_END_CHAR = 0;
|
||||
public static final String TOKEN_START_STRING = Character.toString(TOKEN_START_OR_END_CHAR);
|
||||
public static final String TOKEN_END_STRING = TOKEN_START_STRING + TOKEN_START_STRING;
|
||||
|
||||
public static final class WildcardFieldType extends MappedFieldType {
|
||||
|
||||
static Analyzer lowercaseNormalizer = new LowercaseNormalizer();
|
||||
|
||||
public WildcardFieldType() {
|
||||
setIndexAnalyzer(Lucene.KEYWORD_ANALYZER);
|
||||
setSearchAnalyzer(Lucene.KEYWORD_ANALYZER);
|
||||
|
@ -223,145 +240,392 @@ public class WildcardFieldMapper extends FieldMapper {
|
|||
return result;
|
||||
}
|
||||
|
||||
// Holds parsed information about the wildcard pattern
|
||||
static class PatternStructure {
|
||||
boolean openStart, openEnd, hasSymbols;
|
||||
int lastGap =0;
|
||||
int wildcardCharCount, wildcardStringCount;
|
||||
String[] fragments;
|
||||
Integer [] precedingGapSizes;
|
||||
final String pattern;
|
||||
|
||||
@SuppressWarnings("fallthrough") // Intentionally uses fallthrough mirroring implementation in Lucene's WildcardQuery
|
||||
PatternStructure (String wildcardText) {
|
||||
this.pattern = wildcardText;
|
||||
ArrayList<String> fragmentList = new ArrayList<>();
|
||||
ArrayList<Integer> precedingGapSizeList = new ArrayList<>();
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (int i = 0; i < wildcardText.length();) {
|
||||
final int c = wildcardText.codePointAt(i);
|
||||
int length = Character.charCount(c);
|
||||
switch (c) {
|
||||
case WildcardQuery.WILDCARD_STRING:
|
||||
if (i == 0) {
|
||||
openStart = true;
|
||||
}
|
||||
openEnd = true;
|
||||
hasSymbols = true;
|
||||
wildcardStringCount++;
|
||||
|
||||
if (sb.length() > 0) {
|
||||
precedingGapSizeList.add(lastGap);
|
||||
fragmentList.add(sb.toString());
|
||||
sb = new StringBuilder();
|
||||
}
|
||||
lastGap = Integer.MAX_VALUE;
|
||||
break;
|
||||
case WildcardQuery.WILDCARD_CHAR:
|
||||
if (i == 0) {
|
||||
openStart = true;
|
||||
}
|
||||
hasSymbols = true;
|
||||
wildcardCharCount++;
|
||||
openEnd = true;
|
||||
if (sb.length() > 0) {
|
||||
precedingGapSizeList.add(lastGap);
|
||||
fragmentList.add(sb.toString());
|
||||
sb = new StringBuilder();
|
||||
lastGap = 0;
|
||||
}
|
||||
|
||||
if (lastGap != Integer.MAX_VALUE) {
|
||||
lastGap++;
|
||||
}
|
||||
break;
|
||||
case WildcardQuery.WILDCARD_ESCAPE:
|
||||
// add the next codepoint instead, if it exists
|
||||
if (i + length < wildcardText.length()) {
|
||||
final int nextChar = wildcardText.codePointAt(i + length);
|
||||
length += Character.charCount(nextChar);
|
||||
sb.append(Character.toChars(nextChar));
|
||||
openEnd = false;
|
||||
break;
|
||||
} // else fallthru, lenient parsing with a trailing \
|
||||
default:
|
||||
openEnd = false;
|
||||
sb.append(Character.toChars(c));
|
||||
}
|
||||
i += length;
|
||||
}
|
||||
if (sb.length() > 0) {
|
||||
precedingGapSizeList.add(lastGap);
|
||||
fragmentList.add(sb.toString());
|
||||
lastGap = 0;
|
||||
}
|
||||
fragments = fragmentList.toArray(new String[0]);
|
||||
precedingGapSizes = precedingGapSizeList.toArray(new Integer[0]);
|
||||
|
||||
}
|
||||
|
||||
public boolean needsVerification() {
|
||||
// Return true if term queries are not enough evidence
|
||||
if (fragments.length == 1 && wildcardCharCount == 0) {
|
||||
// The one case where we don't need verification is when
|
||||
// we have a single fragment and no ? characters
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns the number of positions in the gap preceding the given fragment (Integer.MAX_VALUE means an unlimited gap)
|
||||
public int getPrecedingGapSize(int fragmentNum) {
|
||||
return precedingGapSizes[fragmentNum];
|
||||
}
|
||||
|
||||
public boolean isMatchAll() {
|
||||
return fragments.length == 0 && wildcardStringCount >0 && wildcardCharCount ==0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return pattern.hashCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
PatternStructure other = (PatternStructure) obj;
|
||||
return pattern.equals(other.pattern);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QueryShardContext context) {
|
||||
PatternStructure patternStructure = new PatternStructure(wildcardPattern);
|
||||
ArrayList<String> tokens = new ArrayList<>();
|
||||
|
||||
for (int i = 0; i < patternStructure.fragments.length; i++) {
|
||||
String fragment = patternStructure.fragments[i];
|
||||
int fLength = fragment.length();
|
||||
if (fLength == 0) {
|
||||
continue;
|
||||
}
|
||||
String ngramIndexPattern = addLineEndChars(toLowerCase(wildcardPattern));
|
||||
|
||||
// Add any start/end of string character
|
||||
if (i == 0 && patternStructure.openStart == false) {
|
||||
// Start-of-string anchored (is not a leading wildcard)
|
||||
fragment = TOKEN_START_OR_END_CHAR + fragment;
|
||||
// Break search term into tokens
|
||||
Set<String> tokens = new LinkedHashSet<>();
|
||||
StringBuilder sequence = new StringBuilder();
|
||||
int numWildcardChars = 0;
|
||||
int numWildcardStrings = 0;
|
||||
for (int i = 0; i < ngramIndexPattern.length();) {
|
||||
final int c = ngramIndexPattern.codePointAt(i);
|
||||
int length = Character.charCount(c);
|
||||
switch (c) {
|
||||
case WildcardQuery.WILDCARD_STRING:
|
||||
if (sequence.length() > 0) {
|
||||
getNgramTokens(tokens, sequence.toString());
|
||||
sequence = new StringBuilder();
|
||||
}
|
||||
if (patternStructure.openEnd == false && i == patternStructure.fragments.length - 1) {
|
||||
// End-of-string anchored (is not a trailing wildcard)
|
||||
fragment = fragment + TOKEN_START_OR_END_CHAR + TOKEN_START_OR_END_CHAR;
|
||||
numWildcardStrings++;
|
||||
break;
|
||||
case WildcardQuery.WILDCARD_CHAR:
|
||||
if (sequence.length() > 0) {
|
||||
getNgramTokens(tokens, sequence.toString());
|
||||
sequence = new StringBuilder();
|
||||
}
|
||||
if (fragment.codePointCount(0, fragment.length()) <= NGRAM_SIZE) {
|
||||
tokens.add(fragment);
|
||||
numWildcardChars++;
|
||||
break;
|
||||
case WildcardQuery.WILDCARD_ESCAPE:
|
||||
// add the next codepoint instead, if it exists
|
||||
if (i + length < ngramIndexPattern.length()) {
|
||||
final int nextChar = ngramIndexPattern.codePointAt(i + length);
|
||||
length += Character.charCount(nextChar);
|
||||
sequence.append(Character.toChars(nextChar));
|
||||
} else {
|
||||
sequence.append(Character.toChars(c));
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
sequence.append(Character.toChars(c));
|
||||
}
|
||||
i += length;
|
||||
}
|
||||
|
||||
if (sequence.length() > 0) {
|
||||
getNgramTokens(tokens, sequence.toString());
|
||||
}
|
||||
|
||||
BooleanQuery.Builder rewritten = new BooleanQuery.Builder();
|
||||
int clauseCount = 0;
|
||||
for (String string : tokens) {
|
||||
if (clauseCount >= MAX_CLAUSES_IN_APPROXIMATION_QUERY) {
|
||||
break;
|
||||
}
|
||||
addClause(string, rewritten, Occur.MUST);
|
||||
clauseCount++;
|
||||
}
|
||||
Supplier<Automaton> deferredAutomatonSupplier = () -> {
|
||||
return WildcardQuery.toAutomaton(new Term(name(), wildcardPattern));
|
||||
};
|
||||
AutomatonQueryOnBinaryDv verifyingQuery = new AutomatonQueryOnBinaryDv(name(), wildcardPattern, deferredAutomatonSupplier);
|
||||
if (clauseCount > 0) {
|
||||
// We can accelerate execution with the ngram query
|
||||
BooleanQuery approxQuery = rewritten.build();
|
||||
BooleanQuery.Builder verifyingBuilder = new BooleanQuery.Builder();
|
||||
verifyingBuilder.add(new BooleanClause(approxQuery, Occur.MUST));
|
||||
verifyingBuilder.add(new BooleanClause(verifyingQuery, Occur.MUST));
|
||||
return verifyingBuilder.build();
|
||||
} else if (numWildcardChars == 0 || numWildcardStrings > 0) {
|
||||
// We have no concrete characters and we're not a pure length query e.g. ???
|
||||
return new DocValuesFieldExistsQuery(name());
|
||||
}
|
||||
return verifyingQuery;
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public Query regexpQuery(String value, int flags, int maxDeterminizedStates, RewriteMethod method, QueryShardContext context) {
|
||||
if (value.length() == 0) {
|
||||
return new MatchNoDocsQuery();
|
||||
}
|
||||
|
||||
if (context.allowExpensiveQueries() == false) {
|
||||
throw new ElasticsearchException(
|
||||
"[regexp] queries cannot be executed when '" + ALLOW_EXPENSIVE_QUERIES.getKey() + "' is set to false."
|
||||
);
|
||||
}
|
||||
|
||||
RegExp ngramRegex = new RegExp(addLineEndChars(toLowerCase(value)), flags);
|
||||
|
||||
Query approxBooleanQuery = toApproximationQuery(ngramRegex);
|
||||
Query approxNgramQuery = rewriteBoolToNgramQuery(approxBooleanQuery);
|
||||
|
||||
// MatchAll is a special case meaning the regex is known to match everything .* and
|
||||
// there is no need for verification.
|
||||
if (approxNgramQuery instanceof MatchAllDocsQuery) {
|
||||
return existsQuery(context);
|
||||
}
|
||||
Supplier<Automaton> deferredAutomatonSupplier = ()-> {
|
||||
RegExp regex = new RegExp(value, flags);
|
||||
return regex.toAutomaton(maxDeterminizedStates);
|
||||
};
|
||||
|
||||
AutomatonQueryOnBinaryDv verifyingQuery = new AutomatonQueryOnBinaryDv(name(), value, deferredAutomatonSupplier);
|
||||
|
||||
// MatchAllButRequireVerificationQuery is a special case meaning the regex is reduced to a single
|
||||
// clause which we can't accelerate at all and needs verification. Example would be ".."
|
||||
if (approxNgramQuery instanceof MatchAllButRequireVerificationQuery) {
|
||||
return verifyingQuery;
|
||||
}
|
||||
|
||||
// We can accelerate execution with the ngram query
|
||||
BooleanQuery.Builder verifyingBuilder = new BooleanQuery.Builder();
|
||||
verifyingBuilder.add(new BooleanClause(approxNgramQuery, Occur.MUST));
|
||||
verifyingBuilder.add(new BooleanClause(verifyingQuery, Occur.MUST));
|
||||
return verifyingBuilder.build();
|
||||
}
|
||||
|
||||
// Convert a regular expression to a simplified query consisting of BooleanQuery and TermQuery objects
|
||||
// which captures as much of the logic as possible. Query can produce some false positives but shouldn't
|
||||
// produce any false negatives.
|
||||
// In addition to Term and BooleanQuery clauses there are MatchAllDocsQuery objects (e.g for .*) and
|
||||
// a MatchAllButRequireVerificationQuery if we can't resolve to any of the above.
|
||||
// * If an expression resolves to a single MatchAllDocsQuery eg .* then a match all shortcut is possible with
|
||||
// no verification needed.
|
||||
// * If an expression resolves to a MatchAllButRequireVerificationQuery eg .. then only the verification
|
||||
// query is run.
|
||||
// * Anything else is a concrete query that should be run on the ngram index.
|
||||
public static Query toApproximationQuery(RegExp r) throws IllegalArgumentException {
|
||||
Query result = null;
|
||||
switch (r.kind) {
|
||||
case REGEXP_UNION:
|
||||
result = createUnionQuery(r);
|
||||
break;
|
||||
case REGEXP_CONCATENATION:
|
||||
result = createConcatenationQuery(r);
|
||||
break;
|
||||
case REGEXP_STRING:
|
||||
String normalizedString = toLowerCase(r.s);
|
||||
result = new TermQuery(new Term("", normalizedString));
|
||||
break;
|
||||
case REGEXP_CHAR:
|
||||
String cs = new StringBuilder().appendCodePoint(r.c).toString();
|
||||
String normalizedChar = toLowerCase(cs);
|
||||
result = new TermQuery(new Term("", normalizedChar));
|
||||
break;
|
||||
case REGEXP_REPEAT:
|
||||
// Repeat is zero or more times so zero matches = match all
|
||||
result = new MatchAllDocsQuery();
|
||||
break;
|
||||
|
||||
case REGEXP_REPEAT_MIN:
|
||||
case REGEXP_REPEAT_MINMAX:
|
||||
if (r.min > 0) {
|
||||
result = toApproximationQuery(r.exp1);
|
||||
if(result instanceof TermQuery) {
|
||||
// Wrap the repeating expression so that it is not concatenated by a parent which concatenates
|
||||
// plain TermQuery objects together. Boolean queries are interpreted as a black box and not
|
||||
// concatenated.
|
||||
BooleanQuery.Builder wrapper = new BooleanQuery.Builder();
|
||||
wrapper.add(result, Occur.MUST);
|
||||
result = wrapper.build();
|
||||
}
|
||||
} else {
|
||||
// Expressions like (a){0,3} match empty string or up to 3 a's.
|
||||
result = new MatchAllButRequireVerificationQuery();
|
||||
}
|
||||
break;
|
||||
case REGEXP_ANYSTRING:
|
||||
// optimisation for .* queries - match all and no verification stage required.
|
||||
result = new MatchAllDocsQuery();
|
||||
break;
|
||||
// All other kinds of expression cannot be represented as a boolean or term query so return an object
|
||||
// that indicates verification is required
|
||||
case REGEXP_OPTIONAL:
|
||||
case REGEXP_INTERSECTION:
|
||||
case REGEXP_COMPLEMENT:
|
||||
case REGEXP_CHAR_RANGE:
|
||||
case REGEXP_ANYCHAR:
|
||||
case REGEXP_INTERVAL:
|
||||
case REGEXP_EMPTY:
|
||||
case REGEXP_AUTOMATON:
|
||||
result = new MatchAllButRequireVerificationQuery();
|
||||
break;
|
||||
}
|
||||
assert result != null; // All regex types are understood and translated to a query.
|
||||
return result;
|
||||
}
|
||||
|
||||
private static Query createConcatenationQuery(RegExp r) {
|
||||
// Create ANDs of expressions plus collapse consecutive TermQuerys into single longer ones
|
||||
ArrayList<Query> queries = new ArrayList<>();
|
||||
findLeaves(r.exp1, Kind.REGEXP_CONCATENATION, queries);
|
||||
findLeaves(r.exp2, Kind.REGEXP_CONCATENATION, queries);
|
||||
BooleanQuery.Builder bAnd = new BooleanQuery.Builder();
|
||||
StringBuilder sequence = new StringBuilder();
|
||||
for (Query query : queries) {
|
||||
if (query instanceof TermQuery) {
|
||||
TermQuery tq = (TermQuery) query;
|
||||
sequence.append(tq.getTerm().text());
|
||||
} else {
|
||||
if (sequence.length() > 0) {
|
||||
bAnd.add(new TermQuery(new Term("", sequence.toString())), Occur.MUST);
|
||||
sequence = new StringBuilder();
|
||||
}
|
||||
bAnd.add(query, Occur.MUST);
|
||||
}
|
||||
}
|
||||
if (sequence.length() > 0) {
|
||||
bAnd.add(new TermQuery(new Term("", sequence.toString())), Occur.MUST);
|
||||
}
|
||||
BooleanQuery combined = bAnd.build();
|
||||
if (combined.clauses().size() > 0) {
|
||||
return combined;
|
||||
}
|
||||
// There's something in the regex we couldn't represent as a query - resort to a match all with verification
|
||||
return new MatchAllButRequireVerificationQuery();
|
||||
|
||||
}
|
||||
|
||||
private static Query createUnionQuery(RegExp r) {
|
||||
// Create an OR of clauses
|
||||
ArrayList<Query> queries = new ArrayList<>();
|
||||
findLeaves(r.exp1, Kind.REGEXP_UNION, queries);
|
||||
findLeaves(r.exp2, Kind.REGEXP_UNION, queries);
|
||||
BooleanQuery.Builder bOr = new BooleanQuery.Builder();
|
||||
HashSet<Query> uniqueClauses = new HashSet<>();
|
||||
for (Query query : queries) {
|
||||
if (uniqueClauses.add(query)) {
|
||||
bOr.add(query, Occur.SHOULD);
|
||||
}
|
||||
}
|
||||
if (uniqueClauses.size() > 0) {
|
||||
if (uniqueClauses.size() == 1) {
|
||||
// Fully-understood ORs that collapse to a single term should be returned minus
|
||||
// the BooleanQuery wrapper so that they might be concatenated.
|
||||
// Helps turn [Pp][Oo][Ww][Ee][Rr][Ss][Hh][Ee][Ll][Ll] into "powershell"
|
||||
// Each char pair eg (P OR p) can be normalized to (p) which can be a single term
|
||||
return uniqueClauses.iterator().next();
|
||||
} else {
|
||||
return bOr.build();
|
||||
}
|
||||
}
|
||||
// There's something in the regex we couldn't represent as a query - resort to a match all with verification
|
||||
return new MatchAllButRequireVerificationQuery();
|
||||
}
|
||||
|
||||
private static void findLeaves(RegExp exp, Kind kind, List<Query> queries) {
|
||||
if (exp.kind == kind) {
|
||||
findLeaves(exp.exp1, kind, queries);
|
||||
findLeaves( exp.exp2, kind, queries);
|
||||
} else {
|
||||
queries.add(toApproximationQuery(exp));
|
||||
}
|
||||
}
|
||||
|
||||
private static String toLowerCase(String string) {
|
||||
return lowercaseNormalizer.normalize(null, string).utf8ToString();
|
||||
}
|
||||
|
||||
// Takes a BooleanQuery + TermQuery tree representing query logic and rewrites using ngrams of appropriate size.
|
||||
private Query rewriteBoolToNgramQuery(Query approxQuery) {
|
||||
//TODO optimise more intelligently so we:
|
||||
// 1) favour full-length term queries eg abc over short eg a* when pruning too many clauses.
|
||||
// 2) make MAX_CLAUSES_IN_APPROXIMATION_QUERY a global cap rather than per-boolean clause.
|
||||
if (approxQuery == null) {
|
||||
return null;
|
||||
}
|
||||
if (approxQuery instanceof BooleanQuery) {
|
||||
BooleanQuery bq = (BooleanQuery) approxQuery;
|
||||
BooleanQuery.Builder rewritten = new BooleanQuery.Builder();
|
||||
int clauseCount = 0;
|
||||
for (BooleanClause clause : bq) {
|
||||
Query q = rewriteBoolToNgramQuery(clause.getQuery());
|
||||
if (q != null) {
|
||||
if (clause.getOccur().equals(Occur.MUST)) {
|
||||
// Can't drop "should" clauses because it can elevate a sibling optional item
|
||||
// to mandatory (shoulds with 1 clause) causing false negatives
|
||||
// Dropping MUSTs increase false positives which are OK because are verified anyway.
|
||||
clauseCount++;
|
||||
if (clauseCount >= MAX_CLAUSES_IN_APPROXIMATION_QUERY) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
rewritten.add(q, clause.getOccur());
|
||||
}
|
||||
}
|
||||
return simplify(rewritten.build());
|
||||
}
|
||||
if (approxQuery instanceof TermQuery) {
|
||||
TermQuery tq = (TermQuery) approxQuery;
|
||||
|
||||
//Remove simple terms that are only string beginnings or ends.
|
||||
String s = tq.getTerm().text();
|
||||
if (s.equals(WildcardFieldMapper.TOKEN_START_STRING) || s.equals(WildcardFieldMapper.TOKEN_END_STRING)) {
|
||||
return new MatchAllButRequireVerificationQuery();
|
||||
}
|
||||
|
||||
// Break term into tokens
|
||||
Set<String> tokens = new LinkedHashSet<>();
|
||||
getNgramTokens(tokens, s);
|
||||
BooleanQuery.Builder rewritten = new BooleanQuery.Builder();
|
||||
for (String string : tokens) {
|
||||
addClause(string, rewritten, Occur.MUST);
|
||||
}
|
||||
return simplify(rewritten.build());
|
||||
}
|
||||
if (isMatchAll(approxQuery)) {
|
||||
return approxQuery;
|
||||
}
|
||||
throw new IllegalStateException("Invalid query type found parsing regex query:" + approxQuery);
|
||||
}
|
||||
|
||||
static Query simplify(Query input) {
|
||||
if (input instanceof BooleanQuery == false) {
|
||||
return input;
|
||||
}
|
||||
BooleanQuery result = (BooleanQuery) input;
|
||||
if (result.clauses().size() == 0) {
|
||||
// A ".*" clause can produce zero clauses in which case we return MatchAll
|
||||
return new MatchAllDocsQuery();
|
||||
}
|
||||
if (result.clauses().size() == 1) {
|
||||
return simplify(result.clauses().get(0).getQuery());
|
||||
}
|
||||
|
||||
// We may have a mix of MatchAll and concrete queries - assess if we can simplify
|
||||
int matchAllCount = 0;
|
||||
int verifyCount = 0;
|
||||
boolean allConcretesAreOptional = true;
|
||||
for (BooleanClause booleanClause : result.clauses()) {
|
||||
Query q = booleanClause.getQuery();
|
||||
if (q instanceof MatchAllDocsQuery) {
|
||||
matchAllCount++;
|
||||
} else if (q instanceof MatchAllButRequireVerificationQuery) {
|
||||
verifyCount++;
|
||||
} else {
|
||||
// Concrete query
|
||||
if (booleanClause.getOccur() != Occur.SHOULD) {
|
||||
allConcretesAreOptional = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ((allConcretesAreOptional && matchAllCount > 0)) {
|
||||
// Any match all expression takes precedence over all optional concrete queries.
|
||||
return new MatchAllDocsQuery();
|
||||
}
|
||||
|
||||
if ((allConcretesAreOptional && verifyCount > 0)) {
|
||||
// Any match all expression that needs verification takes precedence over all optional concrete queries.
|
||||
return new MatchAllButRequireVerificationQuery();
|
||||
}
|
||||
|
||||
// We have some mandatory concrete queries - strip out the superfluous match all expressions
|
||||
if (allConcretesAreOptional == false && matchAllCount + verifyCount > 0) {
|
||||
BooleanQuery.Builder rewritten = new BooleanQuery.Builder();
|
||||
for (BooleanClause booleanClause : result.clauses()) {
|
||||
if (isMatchAll(booleanClause.getQuery()) == false) {
|
||||
rewritten.add(booleanClause);
|
||||
}
|
||||
}
|
||||
return simplify(rewritten.build());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
static boolean isMatchAll(Query q) {
|
||||
return q instanceof MatchAllDocsQuery || q instanceof MatchAllButRequireVerificationQuery;
|
||||
}
|
||||
|
||||
protected void getNgramTokens(Set<String> tokens, String fragment) {
|
||||
if (fragment.equals(TOKEN_START_STRING) || fragment.equals(TOKEN_END_STRING)) {
|
||||
// If a regex is a form of match-all e.g. ".*" we only produce the token start/end markers as search
|
||||
// terms which can be ignored.
|
||||
return;
|
||||
}
|
||||
// Break fragment into multiple Ngrams
|
||||
TokenStream tokenizer = WILDCARD_ANALYZER.tokenStream(name(), fragment);
|
||||
CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
|
||||
String lastUnusedToken = null;
|
||||
// If fragment length < NGRAM_SIZE then it is not emitted by token stream so need
|
||||
// to initialise with the value here
|
||||
String lastUnusedToken = fragment;
|
||||
try {
|
||||
tokenizer.reset();
|
||||
boolean takeThis = true;
|
||||
|
@ -371,11 +635,16 @@ public class WildcardFieldMapper extends FieldMapper {
|
|||
String tokenValue = termAtt.toString();
|
||||
if (takeThis) {
|
||||
tokens.add(tokenValue);
|
||||
lastUnusedToken = null;
|
||||
} else {
|
||||
lastUnusedToken = tokenValue;
|
||||
}
|
||||
// alternate
|
||||
takeThis = !takeThis;
|
||||
if (tokens.size() >= MAX_CLAUSES_IN_APPROXIMATION_QUERY) {
|
||||
lastUnusedToken = null;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (lastUnusedToken != null) {
|
||||
// given `cake` and 3 grams the loop above would output only `cak` and we need to add trailing
|
||||
|
@ -385,56 +654,119 @@ public class WildcardFieldMapper extends FieldMapper {
|
|||
tokenizer.end();
|
||||
tokenizer.close();
|
||||
} catch (IOException ioe) {
|
||||
throw new ElasticsearchParseException("Error parsing wildcard query pattern fragment [" + fragment + "]");
|
||||
}
|
||||
throw new ElasticsearchParseException("Error parsing wildcard regex pattern fragment [" + fragment + "]");
|
||||
}
|
||||
}
|
||||
|
||||
if (patternStructure.isMatchAll()) {
|
||||
return new MatchAllDocsQuery();
|
||||
}
|
||||
BooleanQuery approximation = createApproximationQuery(tokens);
|
||||
if (approximation.clauses().size() > 1 || patternStructure.needsVerification()) {
|
||||
BooleanQuery.Builder verifyingBuilder = new BooleanQuery.Builder();
|
||||
verifyingBuilder.add(new BooleanClause(approximation, Occur.MUST));
|
||||
Automaton automaton = WildcardQuery.toAutomaton(new Term(name(), wildcardPattern));
|
||||
verifyingBuilder.add(new BooleanClause(new AutomatonQueryOnBinaryDv(name(), wildcardPattern, automaton), Occur.MUST));
|
||||
return verifyingBuilder.build();
|
||||
}
|
||||
return approximation;
|
||||
}
|
||||
|
||||
private BooleanQuery createApproximationQuery(ArrayList<String> tokens) {
|
||||
BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
|
||||
if (tokens.size() <= MAX_CLAUSES_IN_APPROXIMATION_QUERY) {
|
||||
for (String token : tokens) {
|
||||
addClause(token, bqBuilder);
|
||||
}
|
||||
return bqBuilder.build();
|
||||
}
|
||||
// Thin out the number of clauses using a selection spread evenly across the range
|
||||
float step = (float) (tokens.size() - 1) / (float) (MAX_CLAUSES_IN_APPROXIMATION_QUERY - 1); // set step size
|
||||
for (int i = 0; i < MAX_CLAUSES_IN_APPROXIMATION_QUERY; i++) {
|
||||
addClause(tokens.get(Math.round(step * i)), bqBuilder); // add each element of a position which is a multiple of step
|
||||
}
|
||||
// TODO we can be smarter about pruning here. e.g.
|
||||
// * Avoid wildcard queries if there are sufficient numbers of other terms that are full 3grams that are cheaper term queries
|
||||
// * We can select terms on their scarcity rather than even spreads across the search string.
|
||||
|
||||
return bqBuilder.build();
|
||||
}
|
||||
|
||||
private void addClause(String token, BooleanQuery.Builder bqBuilder) {
|
||||
private void addClause(String token, BooleanQuery.Builder bqBuilder, Occur occur) {
|
||||
assert token.codePointCount(0, token.length()) <= NGRAM_SIZE;
|
||||
if (token.codePointCount(0, token.length()) == NGRAM_SIZE) {
|
||||
int tokenSize = token.codePointCount(0, token.length());
|
||||
if (tokenSize < 2 || token.equals(WildcardFieldMapper.TOKEN_END_STRING)) {
|
||||
// there's something concrete to be searched but it's too short
|
||||
// Require verification.
|
||||
bqBuilder.add(new BooleanClause(new MatchAllButRequireVerificationQuery(), occur));
|
||||
return;
|
||||
}
|
||||
if (tokenSize == NGRAM_SIZE) {
|
||||
TermQuery tq = new TermQuery(new Term(name(), token));
|
||||
bqBuilder.add(new BooleanClause(tq, Occur.MUST));
|
||||
bqBuilder.add(new BooleanClause(tq, occur));
|
||||
} else {
|
||||
WildcardQuery wq = new WildcardQuery(new Term(name(), token + "*"));
|
||||
PrefixQuery wq = new PrefixQuery(new Term(name(), token));
|
||||
wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE);
|
||||
bqBuilder.add(new BooleanClause(wq, Occur.MUST));
|
||||
bqBuilder.add(new BooleanClause(wq, occur));
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Query fuzzyQuery(
|
||||
Object value,
|
||||
Fuzziness fuzziness,
|
||||
int prefixLength,
|
||||
int maxExpansions,
|
||||
boolean transpositions,
|
||||
QueryShardContext context
|
||||
) {
|
||||
String searchTerm = BytesRefs.toString(value);
|
||||
String lowerSearchTerm = toLowerCase(searchTerm);
|
||||
try {
|
||||
BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
|
||||
//The approximation query can have a prefix and any number of ngrams.
|
||||
BooleanQuery.Builder approxBuilder = new BooleanQuery.Builder();
|
||||
|
||||
String postPrefixString = lowerSearchTerm;
|
||||
|
||||
// Add all content prior to prefixLength as a MUST clause to the ngram index query
|
||||
if (prefixLength > 0) {
|
||||
Set<String> prefixTokens = new LinkedHashSet<>();
|
||||
postPrefixString = lowerSearchTerm.substring(prefixLength);
|
||||
String prefixCandidate = TOKEN_START_OR_END_CHAR + lowerSearchTerm.substring(0, prefixLength);
|
||||
getNgramTokens(prefixTokens, prefixCandidate);
|
||||
for (String prefixToken : prefixTokens) {
|
||||
addClause(prefixToken, approxBuilder, Occur.MUST);
|
||||
}
|
||||
}
|
||||
// Tokenize all content after the prefix
|
||||
TokenStream tokenizer = WILDCARD_ANALYZER.tokenStream(name(), postPrefixString);
|
||||
CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
|
||||
ArrayList<String> postPrefixTokens = new ArrayList<>();
|
||||
String firstToken = null;
|
||||
tokenizer.reset();
|
||||
int tokenNumber = 0;
|
||||
while (tokenizer.incrementToken()) {
|
||||
if (tokenNumber == 0) {
|
||||
String token = termAtt.toString();
|
||||
if (firstToken == null) {
|
||||
firstToken = token;
|
||||
}
|
||||
postPrefixTokens.add(token);
|
||||
}
|
||||
// Take every 3rd ngram so they are all disjoint. Our calculation for min_should_match
|
||||
// number relies on there being no overlaps
|
||||
tokenNumber++;
|
||||
if (tokenNumber == 3) {
|
||||
tokenNumber = 0;
|
||||
}
|
||||
}
|
||||
tokenizer.end();
|
||||
tokenizer.close();
|
||||
|
||||
BooleanQuery.Builder ngramBuilder = new BooleanQuery.Builder();
|
||||
int numClauses = 0;
|
||||
for (String token : postPrefixTokens) {
|
||||
addClause(token, ngramBuilder, Occur.SHOULD);
|
||||
numClauses++;
|
||||
}
|
||||
|
||||
// Approximation query
|
||||
if (numClauses > fuzziness.asDistance(searchTerm)) {
|
||||
// Useful accelerant - set min should match based on number of permitted edits.
|
||||
ngramBuilder.setMinimumNumberShouldMatch(numClauses - fuzziness.asDistance(searchTerm));
|
||||
approxBuilder.add(ngramBuilder.build(), Occur.MUST);
|
||||
}
|
||||
|
||||
BooleanQuery ngramQ = approxBuilder.build();
|
||||
if (ngramQ.clauses().size()>0) {
|
||||
bqBuilder.add(ngramQ, Occur.MUST);
|
||||
}
|
||||
|
||||
Supplier <Automaton> deferredAutomatonSupplier = ()->{
|
||||
// Verification query
|
||||
FuzzyQuery fq = new FuzzyQuery(
|
||||
new Term(name(), searchTerm),
|
||||
fuzziness.asDistance(searchTerm),
|
||||
prefixLength,
|
||||
maxExpansions,
|
||||
transpositions
|
||||
);
|
||||
return fq.getAutomata().automaton;
|
||||
};
|
||||
bqBuilder.add(new AutomatonQueryOnBinaryDv(name(), searchTerm, deferredAutomatonSupplier), Occur.MUST);
|
||||
|
||||
return bqBuilder.build();
|
||||
} catch (IOException ioe) {
|
||||
throw new ElasticsearchParseException("Error parsing wildcard field fuzzy string [" + searchTerm + "]");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -568,7 +900,10 @@ public class WildcardFieldMapper extends FieldMapper {
|
|||
if (value == null || value.length() > ignoreAbove) {
|
||||
return;
|
||||
}
|
||||
String ngramValue = TOKEN_START_OR_END_CHAR + value + TOKEN_START_OR_END_CHAR + TOKEN_START_OR_END_CHAR;
|
||||
// Always lower case the ngram index and value - helps with
|
||||
// a) speed (less ngram variations to explore on disk and in RAM-based automaton) and
|
||||
// b) uses less disk space
|
||||
String ngramValue = addLineEndChars(WildcardFieldType.toLowerCase(value));
|
||||
Field ngramField = new Field(fieldType().name(), ngramValue, ngramFieldType);
|
||||
fields.add(ngramField);
|
||||
|
||||
|
@ -581,6 +916,11 @@ public class WildcardFieldMapper extends FieldMapper {
|
|||
}
|
||||
}
|
||||
|
||||
// Values held in the ngram index are encoded with special characters to denote start and end of values.
|
||||
static String addLineEndChars(String value) {
|
||||
return TOKEN_START_OR_END_CHAR + value + TOKEN_START_OR_END_CHAR + TOKEN_START_OR_END_CHAR;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String contentType() {
|
||||
return CONTENT_TYPE;
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
|
||||
package org.elasticsearch.xpack.wildcard.mapper;
|
||||
|
||||
import org.apache.lucene.analysis.core.KeywordAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.SortedSetDocValuesField;
|
||||
|
@ -15,20 +16,31 @@ import org.apache.lucene.index.IndexWriterConfig;
|
|||
import org.apache.lucene.index.IndexableField;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.queryparser.classic.ParseException;
|
||||
import org.apache.lucene.queryparser.classic.QueryParser;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.BooleanClause.Occur;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.DocValuesFieldExistsQuery;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.MatchAllDocsQuery;
|
||||
import org.apache.lucene.search.MatchNoDocsQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
import org.apache.lucene.search.Sort;
|
||||
import org.apache.lucene.search.SortField;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.search.WildcardQuery;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.ByteRunAutomaton;
|
||||
import org.apache.lucene.util.automaton.RegExp;
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.cluster.metadata.IndexMetadata;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.common.unit.Fuzziness;
|
||||
import org.elasticsearch.common.util.BigArrays;
|
||||
import org.elasticsearch.index.Index;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
|
@ -55,12 +67,22 @@ import java.util.HashSet;
|
|||
import java.util.function.BiFunction;
|
||||
|
||||
import static org.hamcrest.Matchers.equalTo;
|
||||
import static org.mockito.Mockito.mock;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
public class WildcardFieldMapperTests extends ESTestCase {
|
||||
|
||||
static QueryShardContext createMockQueryShardContext(boolean allowExpensiveQueries) {
|
||||
QueryShardContext queryShardContext = mock(QueryShardContext.class);
|
||||
when(queryShardContext.allowExpensiveQueries()).thenReturn(allowExpensiveQueries);
|
||||
return queryShardContext;
|
||||
}
|
||||
|
||||
private static final String KEYWORD_FIELD_NAME = "keyword_field";
|
||||
private static final String WILDCARD_FIELD_NAME = "wildcard_field";
|
||||
static final int MAX_FIELD_LENGTH = 100;
|
||||
public static final QueryShardContext MOCK_QSC = createMockQueryShardContext(true);
|
||||
|
||||
static final int MAX_FIELD_LENGTH = 30;
|
||||
static WildcardFieldMapper wildcardFieldType;
|
||||
static KeywordFieldMapper keywordFieldType;
|
||||
|
||||
|
@ -136,11 +158,18 @@ public class WildcardFieldMapperTests extends ESTestCase {
|
|||
IndexSearcher searcher = newSearcher(reader);
|
||||
iw.close();
|
||||
|
||||
// Test wildcard query
|
||||
String queryString = randomABString((BooleanQuery.getMaxClauseCount() * 2) + 1);
|
||||
Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(queryString, null, null);
|
||||
TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, 10, Sort.INDEXORDER);
|
||||
assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(0L));
|
||||
|
||||
// Test regexp query
|
||||
wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(queryString, RegExp.ALL, 20000, null, MOCK_QSC);
|
||||
wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, 10, Sort.INDEXORDER);
|
||||
assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(0L));
|
||||
|
||||
|
||||
reader.close();
|
||||
dir.close();
|
||||
}
|
||||
|
@ -181,15 +210,59 @@ public class WildcardFieldMapperTests extends ESTestCase {
|
|||
|
||||
int numSearches = 100;
|
||||
for (int i = 0; i < numSearches; i++) {
|
||||
String randomWildcardPattern = getRandomWildcardPattern();
|
||||
|
||||
Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(randomWildcardPattern, null, null);
|
||||
TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, values.size() + 1, Sort.INDEXORDER);
|
||||
Query wildcardFieldQuery = null;
|
||||
Query keywordFieldQuery = null;
|
||||
String pattern = null;
|
||||
switch (randomInt(3)) {
|
||||
case 0:
|
||||
pattern = getRandomWildcardPattern();
|
||||
wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(pattern, null, MOCK_QSC);
|
||||
keywordFieldQuery = keywordFieldType.fieldType().wildcardQuery(pattern, null, MOCK_QSC);
|
||||
break;
|
||||
case 1:
|
||||
pattern = getRandomRegexPattern(values);
|
||||
wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(pattern, RegExp.ALL, 20000, null, MOCK_QSC);
|
||||
keywordFieldQuery = keywordFieldType.fieldType().regexpQuery(pattern, RegExp.ALL, 20000, null, MOCK_QSC);
|
||||
break;
|
||||
case 2:
|
||||
pattern = randomABString(5);
|
||||
wildcardFieldQuery = wildcardFieldType.fieldType().prefixQuery(pattern, null, MOCK_QSC);
|
||||
keywordFieldQuery = keywordFieldType.fieldType().prefixQuery(pattern, null, MOCK_QSC);
|
||||
break;
|
||||
case 3:
|
||||
int edits = randomInt(2);
|
||||
int prefixLength = randomInt(4);
|
||||
pattern = getRandomFuzzyPattern(values, edits, prefixLength);
|
||||
Fuzziness fuzziness = Fuzziness.AUTO;
|
||||
switch (edits) {
|
||||
case 0:
|
||||
fuzziness = Fuzziness.ZERO;
|
||||
break;
|
||||
case 1:
|
||||
fuzziness = Fuzziness.ONE;
|
||||
break;
|
||||
case 2:
|
||||
fuzziness = Fuzziness.TWO;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
// Prefix length shouldn't be longer than selected search string
|
||||
// BUT keyword field has a bug with prefix length when equal - see https://github.com/elastic/elasticsearch/issues/55790
|
||||
// so we opt for one less
|
||||
prefixLength = Math.min(pattern.length() - 1 , prefixLength);
|
||||
boolean transpositions = randomBoolean();
|
||||
|
||||
Query keywordFieldQuery = new WildcardQuery(new Term(KEYWORD_FIELD_NAME, randomWildcardPattern));
|
||||
TopDocs kwTopDocs = searcher.search(keywordFieldQuery, values.size() + 1, Sort.INDEXORDER);
|
||||
|
||||
assertThat(kwTopDocs.totalHits.value, equalTo(wildcardFieldTopDocs.totalHits.value));
|
||||
wildcardFieldQuery = wildcardFieldType.fieldType().fuzzyQuery(pattern, fuzziness, prefixLength, 50,
|
||||
transpositions, MOCK_QSC);
|
||||
keywordFieldQuery = keywordFieldType.fieldType().fuzzyQuery(pattern, fuzziness, prefixLength, 50,
|
||||
transpositions, MOCK_QSC);
|
||||
break;
|
||||
}
|
||||
TopDocs kwTopDocs = searcher.search(keywordFieldQuery, values.size() + 1, Sort.RELEVANCE);
|
||||
TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, values.size() + 1, Sort.RELEVANCE);
|
||||
assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(kwTopDocs.totalHits.value));
|
||||
|
||||
HashSet<Integer> expectedDocs = new HashSet<>();
|
||||
for (ScoreDoc topDoc : kwTopDocs.scoreDocs) {
|
||||
|
@ -201,7 +274,6 @@ public class WildcardFieldMapperTests extends ESTestCase {
|
|||
assertThat(expectedDocs.size(), equalTo(0));
|
||||
}
|
||||
|
||||
|
||||
//Test keyword and wildcard sort operations are also equivalent
|
||||
QueryShardContext shardContextMock = createMockShardContext();
|
||||
|
||||
|
@ -222,7 +294,333 @@ public class WildcardFieldMapperTests extends ESTestCase {
|
|||
dir.close();
|
||||
}
|
||||
|
||||
public void testRegexAcceleration() throws IOException, ParseException {
|
||||
// All these expressions should rewrite to a match all with no verification step required at all
|
||||
String superfastRegexes[]= { ".*", "...*..", "(foo|bar|.*)", "@"};
|
||||
for (String regex : superfastRegexes) {
|
||||
Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp.ALL, 20000, null, MOCK_QSC);
|
||||
assertTrue(wildcardFieldQuery instanceof DocValuesFieldExistsQuery);
|
||||
}
|
||||
String matchNoDocsRegexes[]= { ""};
|
||||
for (String regex : matchNoDocsRegexes) {
|
||||
Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp.ALL, 20000, null, MOCK_QSC);
|
||||
assertTrue(wildcardFieldQuery instanceof MatchNoDocsQuery);
|
||||
}
|
||||
|
||||
// All of these regexes should be accelerated as the equivalent of the given QueryString query
|
||||
String acceleratedTests[][] = {
|
||||
{".*foo.*", "foo"},
|
||||
{"..foobar","+foo +oba +ar_ +r__"},
|
||||
{"(maynotexist)?foobar","+foo +oba +ar_ +r__"},
|
||||
{".*/etc/passw.*", "+\\/et +tc\\/ +\\/pa +ass +ssw"},
|
||||
{".*etc/passwd", "+etc +c\\/p +pas +ssw +wd_ +d__"},
|
||||
{"(http|ftp)://foo.*", "+((+htt +ttp) ftp) +(+\\:\\/\\/ +\\/fo +foo)"},
|
||||
{"[Pp][Oo][Ww][Ee][Rr][Ss][Hh][Ee][Ll][Ll]\\.[Ee][Xx][Ee]", "+_po +owe +ers +she +ell +l\\.e +exe +e__"},
|
||||
{"foo<1-100>bar", "+(+_fo +foo) +(+bar +r__ )"},
|
||||
{"(aaa.+&.+bbb)cat", "+cat +t__"},
|
||||
{".a", "a__"}
|
||||
};
|
||||
for (String[] test : acceleratedTests) {
|
||||
String regex = test[0];
|
||||
String expectedAccelerationQueryString = test[1].replaceAll("_", ""+WildcardFieldMapper.TOKEN_START_OR_END_CHAR);
|
||||
Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp.ALL, 20000, null, MOCK_QSC);
|
||||
testExpectedAccelerationQuery(regex, wildcardFieldQuery, expectedAccelerationQueryString);
|
||||
}
|
||||
|
||||
// All these expressions should rewrite to just the verification query (there's no ngram acceleration)
|
||||
// TODO we can possibly improve on some of these
|
||||
String matchAllButVerifyTests[]= { "..", "(a)?","(a|b){0,3}", "((foo)?|(foo|bar)?)", "@&~(abc.+)", "aaa.+&.+bbb"};
|
||||
for (String regex : matchAllButVerifyTests) {
|
||||
Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp.ALL, 20000, null, MOCK_QSC);
|
||||
assertTrue(regex +" was not a pure verify query " +formatQuery(wildcardFieldQuery),
|
||||
wildcardFieldQuery instanceof AutomatonQueryOnBinaryDv);
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Documentation - regexes that do try accelerate but we would like to improve in future versions.
|
||||
String suboptimalTests[][] = {
|
||||
// TODO short wildcards like a* OR b* aren't great so we just drop them.
|
||||
// Ideally we would attach to successors to create (acd OR bcd)
|
||||
{ "[ab]cd", "+cd_ +d__"}
|
||||
};
|
||||
for (String[] test : suboptimalTests) {
|
||||
String regex = test[0];
|
||||
String expectedAccelerationQueryString = test[1].replaceAll("_", ""+WildcardFieldMapper.TOKEN_START_OR_END_CHAR);
|
||||
Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp.ALL, 20000, null, MOCK_QSC);
|
||||
|
||||
testExpectedAccelerationQuery(regex, wildcardFieldQuery, expectedAccelerationQueryString);
|
||||
}
|
||||
|
||||
}
|
||||
// Make error messages more readable
|
||||
String formatQuery(Query q) {
|
||||
return q.toString().replaceAll(WILDCARD_FIELD_NAME+":", "").replaceAll(WildcardFieldMapper.TOKEN_START_STRING, "_");
|
||||
}
|
||||
|
||||
public void testWildcardAcceleration() throws IOException, ParseException {
|
||||
|
||||
// All these expressions should rewrite to MatchAll with no verification step required at all
|
||||
String superfastPattern[] = { "*", "**", "*?" };
|
||||
for (String pattern : superfastPattern) {
|
||||
Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(pattern, null, MOCK_QSC);
|
||||
assertTrue(
|
||||
pattern + " was not a pure match all query " + formatQuery(wildcardFieldQuery),
|
||||
wildcardFieldQuery instanceof DocValuesFieldExistsQuery
|
||||
);
|
||||
}
|
||||
|
||||
// All of these patterns should be accelerated.
|
||||
String tests[][] = {
|
||||
{ "*foobar", "+foo +oba +ar_ +r__" },
|
||||
{ "foobar*", "+_fo +oob +bar" },
|
||||
{ "foo\\*bar*", "+_fo +oo\\* +\\*ba +bar" },
|
||||
{ "foo\\?bar*", "+_fo +oo\\? +\\?ba +bar" },
|
||||
{ "foo*bar", "+_fo +foo +bar +r__" },
|
||||
{ "foo?bar", "+_fo +foo +bar +r__" },
|
||||
{ "?foo*bar?", "+foo +bar" },
|
||||
{ "*c", "+c__" } };
|
||||
for (String[] test : tests) {
|
||||
String pattern = test[0];
|
||||
String expectedAccelerationQueryString = test[1].replaceAll("_", "" + WildcardFieldMapper.TOKEN_START_OR_END_CHAR);
|
||||
Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(pattern, null, MOCK_QSC);
|
||||
testExpectedAccelerationQuery(pattern, wildcardFieldQuery, expectedAccelerationQueryString);
|
||||
assertTrue(wildcardFieldQuery instanceof BooleanQuery);
|
||||
}
|
||||
|
||||
// TODO All these expressions have no acceleration at all and could be improved
|
||||
String slowPatterns[] = { "??" };
|
||||
for (String pattern : slowPatterns) {
|
||||
Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(pattern, null, MOCK_QSC);
|
||||
assertTrue(
|
||||
pattern + " was not as slow as we assumed " + formatQuery(wildcardFieldQuery),
|
||||
wildcardFieldQuery instanceof AutomatonQueryOnBinaryDv
|
||||
);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static class FuzzyTest {
|
||||
String pattern;
|
||||
int prefixLength;
|
||||
Fuzziness fuzziness;
|
||||
String expectedPrefixQuery;
|
||||
int expectedMinShouldMatch;
|
||||
String ngrams;
|
||||
|
||||
FuzzyTest(
|
||||
String pattern,
|
||||
int prefixLength,
|
||||
Fuzziness fuzziness,
|
||||
String expectedPrefixQuery,
|
||||
int expectedMinShouldMatch,
|
||||
String ngrams
|
||||
) {
|
||||
super();
|
||||
this.pattern = pattern;
|
||||
this.prefixLength = prefixLength;
|
||||
this.fuzziness = fuzziness;
|
||||
this.expectedPrefixQuery = expectedPrefixQuery;
|
||||
this.expectedMinShouldMatch = expectedMinShouldMatch;
|
||||
this.ngrams = ngrams;
|
||||
}
|
||||
|
||||
Query getFuzzyQuery() {
|
||||
return wildcardFieldType.fieldType().fuzzyQuery(pattern, fuzziness, prefixLength, 50, true, MOCK_QSC);
|
||||
}
|
||||
|
||||
Query getExpectedApproxQuery() throws ParseException {
|
||||
BooleanQuery.Builder bq = new BooleanQuery.Builder();
|
||||
if (expectedPrefixQuery != null) {
|
||||
String[] tokens = expectedPrefixQuery.split(" ");
|
||||
Query prefixQuery = null;
|
||||
if (tokens.length == 1) {
|
||||
prefixQuery = new TermQuery(
|
||||
new Term(WILDCARD_FIELD_NAME, tokens[0].replaceAll("_", WildcardFieldMapper.TOKEN_START_STRING))
|
||||
);
|
||||
} else {
|
||||
BooleanQuery.Builder pqb = new BooleanQuery.Builder();
|
||||
for (String token : tokens) {
|
||||
Query ngramQuery = new TermQuery(
|
||||
new Term(WILDCARD_FIELD_NAME, token.replaceAll("_", WildcardFieldMapper.TOKEN_START_STRING))
|
||||
);
|
||||
pqb.add(ngramQuery, Occur.MUST);
|
||||
}
|
||||
prefixQuery = pqb.build();
|
||||
}
|
||||
|
||||
if (ngrams == null) {
|
||||
return prefixQuery;
|
||||
}
|
||||
bq.add(prefixQuery, Occur.MUST);
|
||||
}
|
||||
|
||||
if (ngrams != null) {
|
||||
BooleanQuery.Builder nq = new BooleanQuery.Builder();
|
||||
String[] tokens = ngrams.split(" ");
|
||||
for (String token : tokens) {
|
||||
Query ngramQuery = new TermQuery(
|
||||
new Term(WILDCARD_FIELD_NAME, token.replaceAll("_", WildcardFieldMapper.TOKEN_START_STRING))
|
||||
);
|
||||
nq.add(ngramQuery, Occur.SHOULD);
|
||||
}
|
||||
nq.setMinimumNumberShouldMatch(expectedMinShouldMatch);
|
||||
bq.add(nq.build(), Occur.MUST);
|
||||
}
|
||||
return bq.build();
|
||||
}
|
||||
}
|
||||
|
||||
public void testFuzzyAcceleration() throws IOException, ParseException {
|
||||
|
||||
FuzzyTest[] tests = {
|
||||
new FuzzyTest("123456", 0, Fuzziness.ONE, null, 1, "123 456"),
|
||||
new FuzzyTest("1234567890", 2, Fuzziness.ONE, "_12", 1, "345 678"),
|
||||
new FuzzyTest("12345678901", 2, Fuzziness.ONE, "_12", 2, "345 678 901"),
|
||||
new FuzzyTest("12345678", 4, Fuzziness.ONE, "_12 234", 0, null)
|
||||
};
|
||||
for (FuzzyTest test : tests) {
|
||||
Query wildcardFieldQuery = test.getFuzzyQuery();
|
||||
testExpectedAccelerationQuery(test.pattern, wildcardFieldQuery, test.getExpectedApproxQuery());
|
||||
}
|
||||
}
|
||||
|
||||
void testExpectedAccelerationQuery(String regex, Query combinedQuery, String expectedAccelerationQueryString) throws ParseException {
|
||||
|
||||
QueryParser qsp = new QueryParser(WILDCARD_FIELD_NAME, new KeywordAnalyzer());
|
||||
Query expectedAccelerationQuery = qsp.parse(expectedAccelerationQueryString);
|
||||
testExpectedAccelerationQuery(regex, combinedQuery, expectedAccelerationQuery);
|
||||
}
|
||||
void testExpectedAccelerationQuery(String regex, Query combinedQuery, Query expectedAccelerationQuery) throws ParseException {
|
||||
BooleanQuery cq = (BooleanQuery) combinedQuery;
|
||||
assert cq.clauses().size() == 2;
|
||||
Query approximationQuery = null;
|
||||
boolean verifyQueryFound = false;
|
||||
for (BooleanClause booleanClause : cq.clauses()) {
|
||||
Query q = booleanClause.getQuery();
|
||||
if (q instanceof AutomatonQueryOnBinaryDv) {
|
||||
verifyQueryFound = true;
|
||||
} else {
|
||||
approximationQuery = q;
|
||||
}
|
||||
}
|
||||
assert verifyQueryFound;
|
||||
|
||||
String message = "regex: "+ regex +"\nactual query: " + formatQuery(approximationQuery) +
|
||||
"\nexpected query: " + formatQuery(expectedAccelerationQuery) + "\n";
|
||||
assertEquals(message, expectedAccelerationQuery, approximationQuery);
|
||||
}
|
||||
|
||||
private String getRandomFuzzyPattern(HashSet<String> values, int edits, int prefixLength) {
|
||||
assert edits >=0 && edits <=2;
|
||||
// Pick one of the indexed document values to focus our queries on.
|
||||
String randomValue = values.toArray(new String[0])[randomIntBetween(0, values.size()-1)];
|
||||
|
||||
if (edits == 0) {
|
||||
return randomValue;
|
||||
}
|
||||
|
||||
if (randomValue.length() > prefixLength) {
|
||||
randomValue = randomValue.substring(0,prefixLength) + "C" + randomValue.substring(prefixLength);
|
||||
edits--;
|
||||
}
|
||||
|
||||
if(edits > 0) {
|
||||
randomValue = randomValue + "a";
|
||||
}
|
||||
return randomValue;
|
||||
}
|
||||
|
||||
private String getRandomRegexPattern(HashSet<String> values) {
|
||||
// Pick one of the indexed document values to focus our queries on.
|
||||
String randomValue = values.toArray(new String[0])[randomIntBetween(0, values.size()-1)];
|
||||
return convertToRandomRegex(randomValue);
|
||||
}
|
||||
|
||||
// Produces a random regex string guaranteed to match the provided value
|
||||
protected String convertToRandomRegex(String randomValue) {
|
||||
StringBuilder result = new StringBuilder();
|
||||
//Pick a part of the string to change
|
||||
int substitutionPoint = randomIntBetween(0, randomValue.length()-1);
|
||||
int substitutionLength = randomIntBetween(1, Math.min(10, randomValue.length() - substitutionPoint));
|
||||
|
||||
//Add any head to the result, unchanged
|
||||
if(substitutionPoint >0) {
|
||||
result.append(randomValue.substring(0,substitutionPoint));
|
||||
}
|
||||
|
||||
// Modify the middle...
|
||||
String replacementPart = randomValue.substring(substitutionPoint, substitutionPoint+substitutionLength);
|
||||
int mutation = randomIntBetween(0, 11);
|
||||
switch (mutation) {
|
||||
case 0:
|
||||
// OR with random alpha of same length
|
||||
result.append("("+replacementPart+"|c"+ randomABString(replacementPart.length())+")");
|
||||
break;
|
||||
case 1:
|
||||
// OR with non-existant value
|
||||
result.append("("+replacementPart+"|doesnotexist)");
|
||||
break;
|
||||
case 2:
|
||||
// OR with another randomised regex (used to create nested levels of expression).
|
||||
result.append("(" + convertToRandomRegex(replacementPart) +"|doesnotexist)");
|
||||
break;
|
||||
case 3:
|
||||
// Star-replace all ab sequences.
|
||||
result.append(replacementPart.replaceAll("ab", ".*"));
|
||||
break;
|
||||
case 4:
|
||||
// .-replace all b chars
|
||||
result.append(replacementPart.replaceAll("b", "."));
|
||||
break;
|
||||
case 5:
|
||||
// length-limited stars {1,2}
|
||||
result.append(".{1,"+replacementPart.length()+"}");
|
||||
break;
|
||||
case 6:
|
||||
// replace all chars with .
|
||||
result.append(replacementPart.replaceAll(".", "."));
|
||||
break;
|
||||
case 7:
|
||||
// OR with uppercase chars eg [aA] (many of these sorts of expression in the wild..
|
||||
char [] chars = replacementPart.toCharArray();
|
||||
for (char c : chars) {
|
||||
result.append("[" + c + Character.toUpperCase(c) +"]");
|
||||
}
|
||||
break;
|
||||
case 8:
|
||||
// NOT a character - replace all b's with "not a"
|
||||
result.append(replacementPart.replaceAll("b", "[^a]"));
|
||||
break;
|
||||
case 9:
|
||||
// Make whole part repeatable 1 or more times
|
||||
result.append("(" + replacementPart +")+");
|
||||
break;
|
||||
case 10:
|
||||
// Make whole part repeatable 0 or more times
|
||||
result.append("(" + replacementPart +")?");
|
||||
break;
|
||||
case 11:
|
||||
// all but ... syntax
|
||||
result.append("@&~(doesnotexist.+)");
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
//add any remaining tail, unchanged
|
||||
if(substitutionPoint + substitutionLength <= randomValue.length()-1) {
|
||||
result.append(randomValue.substring(substitutionPoint + substitutionLength));
|
||||
}
|
||||
|
||||
//Assert our randomly generated regex actually matches the provided raw input.
|
||||
RegExp regex = new RegExp(result.toString());
|
||||
Automaton automaton = regex.toAutomaton();
|
||||
ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automaton);
|
||||
BytesRef br = new BytesRef(randomValue);
|
||||
assertTrue("[" + result.toString() + "]should match [" + randomValue + "]" + substitutionPoint + "-" + substitutionLength + "/"
|
||||
+ randomValue.length(), bytesMatcher.run(br.bytes, br.offset, br.length));
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
protected MappedFieldType provideMappedFieldType(String name) {
|
||||
if (name.equals(WILDCARD_FIELD_NAME)) {
|
||||
|
@ -283,8 +681,12 @@ public class WildcardFieldMapperTests extends ESTestCase {
|
|||
static String randomABString(int minLength) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
while (sb.length() < minLength) {
|
||||
if (randomBoolean()) {
|
||||
if (randomBoolean()) {
|
||||
sb.append("a");
|
||||
} else {
|
||||
sb.append("A");
|
||||
}
|
||||
} else {
|
||||
sb.append("b");
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue