Add regex query support to wildcard field (approach 2) (#55548) (#57141)

Backport of #55548

Adds equivalence with the keyword field's query support to the wildcard field: regexp, fuzzy, wildcard and prefix queries are all supported.
All of these queries use an approximation query against the ngram index, backed by an automaton-based verification query.

Closes #54275
markharwood 2020-05-26 16:55:59 +01:00 committed by GitHub
parent 9f1e3bc82b
commit b2bc6071fd
5 changed files with 1005 additions and 208 deletions
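The shape of the accelerated queries is the same in every case: a cheap ngram approximation query ANDed with an exact, automaton-backed verification query that runs against binary doc values. A minimal sketch of that composition, assuming a hypothetical field name and ngram terms (AutomatonQueryOnBinaryDv and the deferred-Supplier pattern are the ones introduced by this commit):

import java.util.function.Supplier;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.automaton.Automaton;

// Approximation: MUST clauses over ngram terms; may produce false positives.
BooleanQuery.Builder approx = new BooleanQuery.Builder();
approx.add(new TermQuery(new Term("my_wildcard", "foo")), Occur.MUST);
approx.add(new TermQuery(new Term("my_wildcard", "oba")), Occur.MUST);

// Verification: the automaton is built lazily, so the cost is only paid at execution time.
Supplier<Automaton> automaton = () -> WildcardQuery.toAutomaton(new Term("my_wildcard", "*foobar*"));
Query verify = new AutomatonQueryOnBinaryDv("my_wildcard", "*foobar*", automaton);

BooleanQuery.Builder combined = new BooleanQuery.Builder();
combined.add(approx.build(), Occur.MUST);   // fast ngram filter
combined.add(verify, Occur.MUST);           // removes any false positives
Query query = combined.build();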

View File: wildcard field REST API test (YAML)

@@ -1,8 +1,8 @@
 setup:
   - skip:
       features: headers
-      version: " - 7.7.99"
-      reason: "wildcard fields were added from 7.8"
+      version: " - 7.8.99"
+      reason: "wildcard fields were added from 7.9"
   - do:
       indices.create:
View File: AutomatonQueryOnBinaryDv.java

@@ -24,6 +24,7 @@ import org.apache.lucene.util.automaton.ByteRunAutomaton;
 import java.io.IOException;
 import java.util.Objects;
+import java.util.function.Supplier;

 /**
  * Query that runs an Automaton across all binary doc values.
@@ -33,18 +34,19 @@ public class AutomatonQueryOnBinaryDv extends Query {
     private final String field;
     private final String matchPattern;
-    private final Automaton automaton;
+    private final Supplier<Automaton> automatonSupplier;

-    public AutomatonQueryOnBinaryDv(String field, String matchPattern, Automaton automaton) {
+    public AutomatonQueryOnBinaryDv(String field, String matchPattern, Supplier<Automaton> automatonSupplier) {
         this.field = field;
         this.matchPattern = matchPattern;
-        this.automaton = automaton;
+        this.automatonSupplier = automatonSupplier;
     }

     @Override
     public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
-        ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automaton);
+        ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automatonSupplier.get());
         return new ConstantScoreWeight(this, boost) {
@@ -92,6 +94,9 @@ public class AutomatonQueryOnBinaryDv extends Query {
     @Override
     public boolean equals(Object obj) {
+        if (obj == null || obj.getClass() != getClass()) {
+            return false;
+        }
         AutomatonQueryOnBinaryDv other = (AutomatonQueryOnBinaryDv) obj;
         return Objects.equals(field, other.field) && Objects.equals(matchPattern, other.matchPattern);
     }

View File: MatchAllButRequireVerificationQuery.java (new file)

@ -0,0 +1,50 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.wildcard.mapper;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import java.io.IOException;
/**
* A query that matches all documents. The class is more of a marker
* that we encountered something that will need verification.
* (A MatchAllDocs query is used to indicate we can match all
* _without_ verification)
*/
public final class MatchAllButRequireVerificationQuery extends Query {
@Override
public Query rewrite(IndexReader reader) throws IOException {
return new MatchAllDocsQuery();
}
@Override
public String toString(String field) {
return "*:* (tbc)";
}
@Override
public boolean equals(Object o) {
return sameClassAs(o);
}
@Override
public int hashCode() {
return classHash();
}
@Override
public void visit(QueryVisitor visitor) {
visitor.visitLeaf(this);
}
}
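A note on how this marker is consumed - a condensed sketch of the branching in regexpQuery() from WildcardFieldMapper later in this diff (names as in this commit):

// approxNgramQuery is the ngram rewrite of the regex approximation
if (approxNgramQuery instanceof MatchAllDocsQuery) {
    return existsQuery(context);   // e.g. ".*" - matches everything, no verification needed
}
if (approxNgramQuery instanceof MatchAllButRequireVerificationQuery) {
    return verifyingQuery;         // e.g. ".." - nothing to accelerate, run verification only
}
// otherwise AND the ngram approximation with the verification query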

View File: WildcardFieldMapper.java

@@ -21,24 +21,32 @@ import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.ConstantScoreQuery;
 import org.apache.lucene.search.DocValuesFieldExistsQuery;
+import org.apache.lucene.search.FuzzyQuery;
 import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.MatchNoDocsQuery;
 import org.apache.lucene.search.MultiTermQuery;
 import org.apache.lucene.search.MultiTermQuery.RewriteMethod;
+import org.apache.lucene.search.PrefixQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.SortField;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.WildcardQuery;
 import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.RegExp;
+import org.apache.lucene.util.automaton.RegExp.Kind;
+import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.ElasticsearchParseException;
 import org.elasticsearch.common.lucene.BytesRefs;
 import org.elasticsearch.common.lucene.Lucene;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.unit.Fuzziness;
 import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.common.xcontent.XContentParser;
 import org.elasticsearch.common.xcontent.support.XContentMapValues;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.analysis.AnalyzerScope;
+import org.elasticsearch.index.analysis.LowercaseNormalizer;
 import org.elasticsearch.index.analysis.NamedAnalyzer;
 import org.elasticsearch.index.fielddata.IndexFieldData;
 import org.elasticsearch.index.fielddata.IndexFieldData.XFieldComparatorSource.Nested;
@@ -63,11 +71,16 @@ import org.elasticsearch.search.aggregations.support.ValuesSourceType;
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.Iterator;
+import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
+import java.util.function.Supplier;

 import static org.elasticsearch.index.mapper.TypeParsers.parseField;
+import static org.elasticsearch.search.SearchService.ALLOW_EXPENSIVE_QUERIES;

 /**
  * A {@link FieldMapper} for indexing fields with ngrams for efficient wildcard matching
@@ -206,9 +219,13 @@ public class WildcardFieldMapper extends FieldMapper {
     }

     public static final char TOKEN_START_OR_END_CHAR = 0;
+    public static final String TOKEN_START_STRING = Character.toString(TOKEN_START_OR_END_CHAR);
+    public static final String TOKEN_END_STRING = TOKEN_START_STRING + TOKEN_START_STRING;

     public static final class WildcardFieldType extends MappedFieldType {

+        static Analyzer lowercaseNormalizer = new LowercaseNormalizer();
+
         public WildcardFieldType() {
             setIndexAnalyzer(Lucene.KEYWORD_ANALYZER);
             setSearchAnalyzer(Lucene.KEYWORD_ANALYZER);
@@ -223,218 +240,533 @@ public class WildcardFieldMapper extends FieldMapper {
             return result;
         }
-        // Holds parsed information about the wildcard pattern
-        static class PatternStructure {
-            boolean openStart, openEnd, hasSymbols;
-            int lastGap = 0;
-            int wildcardCharCount, wildcardStringCount;
-            String[] fragments;
-            Integer[] precedingGapSizes;
-            final String pattern;
-
-            @SuppressWarnings("fallthrough") // Intentionally uses fallthrough mirroring implementation in Lucene's WildcardQuery
-            PatternStructure(String wildcardText) {
-                this.pattern = wildcardText;
-                ArrayList<String> fragmentList = new ArrayList<>();
-                ArrayList<Integer> precedingGapSizeList = new ArrayList<>();
-                StringBuilder sb = new StringBuilder();
-                for (int i = 0; i < wildcardText.length();) {
-                    final int c = wildcardText.codePointAt(i);
-                    int length = Character.charCount(c);
-                    switch (c) {
-                        case WildcardQuery.WILDCARD_STRING:
-                            if (i == 0) {
-                                openStart = true;
-                            }
-                            openEnd = true;
-                            hasSymbols = true;
-                            wildcardStringCount++;
-                            if (sb.length() > 0) {
-                                precedingGapSizeList.add(lastGap);
-                                fragmentList.add(sb.toString());
-                                sb = new StringBuilder();
-                            }
-                            lastGap = Integer.MAX_VALUE;
-                            break;
-                        case WildcardQuery.WILDCARD_CHAR:
-                            if (i == 0) {
-                                openStart = true;
-                            }
-                            hasSymbols = true;
-                            wildcardCharCount++;
-                            openEnd = true;
-                            if (sb.length() > 0) {
-                                precedingGapSizeList.add(lastGap);
-                                fragmentList.add(sb.toString());
-                                sb = new StringBuilder();
-                                lastGap = 0;
-                            }
-                            if (lastGap != Integer.MAX_VALUE) {
-                                lastGap++;
-                            }
-                            break;
-                        case WildcardQuery.WILDCARD_ESCAPE:
-                            // add the next codepoint instead, if it exists
-                            if (i + length < wildcardText.length()) {
-                                final int nextChar = wildcardText.codePointAt(i + length);
-                                length += Character.charCount(nextChar);
-                                sb.append(Character.toChars(nextChar));
-                                openEnd = false;
-                                break;
-                            } // else fallthru, lenient parsing with a trailing \
-                        default:
-                            openEnd = false;
-                            sb.append(Character.toChars(c));
-                    }
-                    i += length;
-                }
-                if (sb.length() > 0) {
-                    precedingGapSizeList.add(lastGap);
-                    fragmentList.add(sb.toString());
-                    lastGap = 0;
-                }
-                fragments = fragmentList.toArray(new String[0]);
-                precedingGapSizes = precedingGapSizeList.toArray(new Integer[0]);
-            }
-
-            public boolean needsVerification() {
-                // Return true if term queries are not enough evidence
-                if (fragments.length == 1 && wildcardCharCount == 0) {
-                    // The one case where we don't need verification is when
-                    // we have a single fragment and no ? characters
-                    return false;
-                }
-                return true;
-            }
-
-            // Returns number of positions for last gap (Integer.MAX means unlimited gap)
-            public int getPrecedingGapSize(int fragmentNum) {
-                return precedingGapSizes[fragmentNum];
-            }
-
-            public boolean isMatchAll() {
-                return fragments.length == 0 && wildcardStringCount > 0 && wildcardCharCount == 0;
-            }
-
-            @Override
-            public int hashCode() {
-                return pattern.hashCode();
-            }
-
-            @Override
-            public boolean equals(Object obj) {
-                PatternStructure other = (PatternStructure) obj;
-                return pattern.equals(other.pattern);
-            }
-        }
-
-        @Override
-        public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QueryShardContext context) {
-            PatternStructure patternStructure = new PatternStructure(wildcardPattern);
-            ArrayList<String> tokens = new ArrayList<>();
-
-            for (int i = 0; i < patternStructure.fragments.length; i++) {
-                String fragment = patternStructure.fragments[i];
-                int fLength = fragment.length();
-                if (fLength == 0) {
-                    continue;
-                }
-
-                // Add any start/end of string character
-                if (i == 0 && patternStructure.openStart == false) {
-                    // Start-of-string anchored (is not a leading wildcard)
-                    fragment = TOKEN_START_OR_END_CHAR + fragment;
-                }
-                if (patternStructure.openEnd == false && i == patternStructure.fragments.length - 1) {
-                    // End-of-string anchored (is not a trailing wildcard)
-                    fragment = fragment + TOKEN_START_OR_END_CHAR + TOKEN_START_OR_END_CHAR;
-                }
-                if (fragment.codePointCount(0, fragment.length()) <= NGRAM_SIZE) {
-                    tokens.add(fragment);
-                } else {
-                    // Break fragment into multiple Ngrams
-                    TokenStream tokenizer = WILDCARD_ANALYZER.tokenStream(name(), fragment);
-                    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
-                    String lastUnusedToken = null;
-                    try {
-                        tokenizer.reset();
-                        boolean takeThis = true;
-                        // minimise number of terms searched - eg for "12345" and 3grams we only need terms
-                        // `123` and `345` - no need to search for 234. We take every other ngram.
-                        while (tokenizer.incrementToken()) {
-                            String tokenValue = termAtt.toString();
-                            if (takeThis) {
-                                tokens.add(tokenValue);
-                            } else {
-                                lastUnusedToken = tokenValue;
-                            }
-                            // alternate
-                            takeThis = !takeThis;
-                        }
-                        if (lastUnusedToken != null) {
-                            // given `cake` and 3 grams the loop above would output only `cak` and we need to add trailing
-                            // `ake` to complete the logic.
-                            tokens.add(lastUnusedToken);
-                        }
-                        tokenizer.end();
-                        tokenizer.close();
-                    } catch (IOException ioe) {
-                        throw new ElasticsearchParseException("Error parsing wildcard query pattern fragment [" + fragment + "]");
-                    }
-                }
-            }
-
-            if (patternStructure.isMatchAll()) {
-                return new MatchAllDocsQuery();
-            }
-            BooleanQuery approximation = createApproximationQuery(tokens);
-            if (approximation.clauses().size() > 1 || patternStructure.needsVerification()) {
-                BooleanQuery.Builder verifyingBuilder = new BooleanQuery.Builder();
-                verifyingBuilder.add(new BooleanClause(approximation, Occur.MUST));
-                Automaton automaton = WildcardQuery.toAutomaton(new Term(name(), wildcardPattern));
-                verifyingBuilder.add(new BooleanClause(new AutomatonQueryOnBinaryDv(name(), wildcardPattern, automaton), Occur.MUST));
-                return verifyingBuilder.build();
-            }
-            return approximation;
-        }
-
-        private BooleanQuery createApproximationQuery(ArrayList<String> tokens) {
-            BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
-            if (tokens.size() <= MAX_CLAUSES_IN_APPROXIMATION_QUERY) {
-                for (String token : tokens) {
-                    addClause(token, bqBuilder);
-                }
-                return bqBuilder.build();
-            }
-            // Thin out the number of clauses using a selection spread evenly across the range
-            float step = (float) (tokens.size() - 1) / (float) (MAX_CLAUSES_IN_APPROXIMATION_QUERY - 1); // set step size
-            for (int i = 0; i < MAX_CLAUSES_IN_APPROXIMATION_QUERY; i++) {
-                addClause(tokens.get(Math.round(step * i)), bqBuilder); // add each element of a position which is a multiple of step
-            }
-            // TODO we can be smarter about pruning here. e.g.
-            // * Avoid wildcard queries if there are sufficient numbers of other terms that are full 3grams that are cheaper term queries
-            // * We can select terms on their scarcity rather than even spreads across the search string.
-            return bqBuilder.build();
-        }
-
-        private void addClause(String token, BooleanQuery.Builder bqBuilder) {
-            assert token.codePointCount(0, token.length()) <= NGRAM_SIZE;
-            if (token.codePointCount(0, token.length()) == NGRAM_SIZE) {
-                TermQuery tq = new TermQuery(new Term(name(), token));
-                bqBuilder.add(new BooleanClause(tq, Occur.MUST));
-            } else {
-                WildcardQuery wq = new WildcardQuery(new Term(name(), token + "*"));
-                wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE);
-                bqBuilder.add(new BooleanClause(wq, Occur.MUST));
-            }
-        }
        @Override
        public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QueryShardContext context) {

            String ngramIndexPattern = addLineEndChars(toLowerCase(wildcardPattern));

            // Break search term into tokens
            Set<String> tokens = new LinkedHashSet<>();
            StringBuilder sequence = new StringBuilder();
            int numWildcardChars = 0;
            int numWildcardStrings = 0;
            for (int i = 0; i < ngramIndexPattern.length();) {
                final int c = ngramIndexPattern.codePointAt(i);
                int length = Character.charCount(c);
                switch (c) {
                    case WildcardQuery.WILDCARD_STRING:
                        if (sequence.length() > 0) {
                            getNgramTokens(tokens, sequence.toString());
                            sequence = new StringBuilder();
                        }
                        numWildcardStrings++;
                        break;
                    case WildcardQuery.WILDCARD_CHAR:
                        if (sequence.length() > 0) {
                            getNgramTokens(tokens, sequence.toString());
                            sequence = new StringBuilder();
                        }
                        numWildcardChars++;
                        break;
                    case WildcardQuery.WILDCARD_ESCAPE:
                        // add the next codepoint instead, if it exists
                        if (i + length < ngramIndexPattern.length()) {
                            final int nextChar = ngramIndexPattern.codePointAt(i + length);
                            length += Character.charCount(nextChar);
                            sequence.append(Character.toChars(nextChar));
                        } else {
                            sequence.append(Character.toChars(c));
                        }
                        break;
                    default:
                        sequence.append(Character.toChars(c));
                }
                i += length;
            }

            if (sequence.length() > 0) {
                getNgramTokens(tokens, sequence.toString());
            }

            BooleanQuery.Builder rewritten = new BooleanQuery.Builder();
            int clauseCount = 0;
            for (String string : tokens) {
                if (clauseCount >= MAX_CLAUSES_IN_APPROXIMATION_QUERY) {
                    break;
                }
                addClause(string, rewritten, Occur.MUST);
                clauseCount++;
            }
            Supplier<Automaton> deferredAutomatonSupplier = () -> {
                return WildcardQuery.toAutomaton(new Term(name(), wildcardPattern));
            };
            AutomatonQueryOnBinaryDv verifyingQuery = new AutomatonQueryOnBinaryDv(name(), wildcardPattern, deferredAutomatonSupplier);
            if (clauseCount > 0) {
                // We can accelerate execution with the ngram query
                BooleanQuery approxQuery = rewritten.build();
                BooleanQuery.Builder verifyingBuilder = new BooleanQuery.Builder();
                verifyingBuilder.add(new BooleanClause(approxQuery, Occur.MUST));
                verifyingBuilder.add(new BooleanClause(verifyingQuery, Occur.MUST));
                return verifyingBuilder.build();
            } else if (numWildcardChars == 0 || numWildcardStrings > 0) {
                // We have no concrete characters and we're not a pure length query e.g. ???
                return new DocValuesFieldExistsQuery(name());
            }
            return verifyingQuery;
        }
@Override
public Query regexpQuery(String value, int flags, int maxDeterminizedStates, RewriteMethod method, QueryShardContext context) {
if (value.length() == 0) {
return new MatchNoDocsQuery();
}
if (context.allowExpensiveQueries() == false) {
throw new ElasticsearchException(
"[regexp] queries cannot be executed when '" + ALLOW_EXPENSIVE_QUERIES.getKey() + "' is set to false."
);
}
RegExp ngramRegex = new RegExp(addLineEndChars(toLowerCase(value)), flags);
Query approxBooleanQuery = toApproximationQuery(ngramRegex);
Query approxNgramQuery = rewriteBoolToNgramQuery(approxBooleanQuery);
// MatchAll is a special case meaning the regex is known to match everything .* and
// there is no need for verification.
if (approxNgramQuery instanceof MatchAllDocsQuery) {
return existsQuery(context);
}
Supplier<Automaton> deferredAutomatonSupplier = ()-> {
RegExp regex = new RegExp(value, flags);
return regex.toAutomaton(maxDeterminizedStates);
};
AutomatonQueryOnBinaryDv verifyingQuery = new AutomatonQueryOnBinaryDv(name(), value, deferredAutomatonSupplier);
// MatchAllButRequireVerificationQuery is a special case meaning the regex is reduced to a single
// clause which we can't accelerate at all and needs verification. Example would be ".."
if (approxNgramQuery instanceof MatchAllButRequireVerificationQuery) {
return verifyingQuery;
}
// We can accelerate execution with the ngram query
BooleanQuery.Builder verifyingBuilder = new BooleanQuery.Builder();
verifyingBuilder.add(new BooleanClause(approxNgramQuery, Occur.MUST));
verifyingBuilder.add(new BooleanClause(verifyingQuery, Occur.MUST));
return verifyingBuilder.build();
}
// Convert a regular expression to a simplified query consisting of BooleanQuery and TermQuery objects
// which captures as much of the logic as possible. Query can produce some false positives but shouldn't
// produce any false negatives.
// In addition to Term and BooleanQuery clauses there are MatchAllDocsQuery objects (e.g for .*) and
// a RegExpQuery if we can't resolve to any of the above.
// * If an expression resolves to a single MatchAllDocsQuery eg .* then a match all shortcut is possible with
// no verification needed.
// * If an expression resolves to a RegExpQuery eg ?? then only the verification
// query is run.
// * Anything else is a concrete query that should be run on the ngram index.
public static Query toApproximationQuery(RegExp r) throws IllegalArgumentException {
Query result = null;
switch (r.kind) {
case REGEXP_UNION:
result = createUnionQuery(r);
break;
case REGEXP_CONCATENATION:
result = createConcatenationQuery(r);
break;
case REGEXP_STRING:
String normalizedString = toLowerCase(r.s);
result = new TermQuery(new Term("", normalizedString));
break;
case REGEXP_CHAR:
String cs = new StringBuilder().appendCodePoint(r.c).toString();
String normalizedChar = toLowerCase(cs);
result = new TermQuery(new Term("", normalizedChar));
break;
case REGEXP_REPEAT:
// Repeat is zero or more times so zero matches = match all
result = new MatchAllDocsQuery();
break;
case REGEXP_REPEAT_MIN:
case REGEXP_REPEAT_MINMAX:
if (r.min > 0) {
result = toApproximationQuery(r.exp1);
if(result instanceof TermQuery) {
// Wrap the repeating expression so that it is not concatenated by a parent which concatenates
// plain TermQuery objects together. Boolean queries are interpreted as a black box and not
// concatenated.
BooleanQuery.Builder wrapper = new BooleanQuery.Builder();
wrapper.add(result, Occur.MUST);
result = wrapper.build();
}
} else {
// Expressions like (a){0,3} match empty string or up to 3 a's.
result = new MatchAllButRequireVerificationQuery();
}
break;
case REGEXP_ANYSTRING:
// optimisation for .* queries - match all and no verification stage required.
result = new MatchAllDocsQuery();
break;
// All other kinds of expression cannot be represented as a boolean or term query so return an object
// that indicates verification is required
case REGEXP_OPTIONAL:
case REGEXP_INTERSECTION:
case REGEXP_COMPLEMENT:
case REGEXP_CHAR_RANGE:
case REGEXP_ANYCHAR:
case REGEXP_INTERVAL:
case REGEXP_EMPTY:
case REGEXP_AUTOMATON:
result = new MatchAllButRequireVerificationQuery();
break;
}
assert result != null; // All regex types are understood and translated to a query.
return result;
}
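// Worked example (taken from this commit's tests): the regex "(http|ftp)://foo.*"
// is approximated by the translation above as
//     +((+"htt" +"ttp") "ftp") +(+"://" +"/fo" +"foo")
// once rewritten to ngrams - concrete MUST/SHOULD clauses the ngram index can
// execute, with any remaining false positives removed by the verification query.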
private static Query createConcatenationQuery(RegExp r) {
// Create ANDs of expressions plus collapse consecutive TermQuerys into single longer ones
ArrayList<Query> queries = new ArrayList<>();
findLeaves(r.exp1, Kind.REGEXP_CONCATENATION, queries);
findLeaves(r.exp2, Kind.REGEXP_CONCATENATION, queries);
BooleanQuery.Builder bAnd = new BooleanQuery.Builder();
StringBuilder sequence = new StringBuilder();
for (Query query : queries) {
if (query instanceof TermQuery) {
TermQuery tq = (TermQuery) query;
sequence.append(tq.getTerm().text());
                } else {
                    if (sequence.length() > 0) {
                        bAnd.add(new TermQuery(new Term("", sequence.toString())), Occur.MUST);
                        sequence = new StringBuilder();
                    }
                    bAnd.add(query, Occur.MUST);
                }
            }
            if (sequence.length() > 0) {
                bAnd.add(new TermQuery(new Term("", sequence.toString())), Occur.MUST);
            }
            BooleanQuery combined = bAnd.build();
            if (combined.clauses().size() > 0) {
                return combined;
            }
            // There's something in the regex we couldn't represent as a query - resort to a match all with verification
            return new MatchAllButRequireVerificationQuery();
        }

        private static Query createUnionQuery(RegExp r) {
            // Create an OR of clauses
            ArrayList<Query> queries = new ArrayList<>();
            findLeaves(r.exp1, Kind.REGEXP_UNION, queries);
            findLeaves(r.exp2, Kind.REGEXP_UNION, queries);
            BooleanQuery.Builder bOr = new BooleanQuery.Builder();
            HashSet<Query> uniqueClauses = new HashSet<>();
            for (Query query : queries) {
                if (uniqueClauses.add(query)) {
                    bOr.add(query, Occur.SHOULD);
                }
            }
            if (uniqueClauses.size() > 0) {
                if (uniqueClauses.size() == 1) {
                    // Fully-understood ORs that collapse to a single term should be returned minus
                    // the BooleanQuery wrapper so that they might be concatenated.
                    // Helps turn [Pp][Oo][Ww][Ee][Rr][Ss][Hh][Ee][Ll][Ll] into "powershell"
                    // Each char pair eg (P OR p) can be normalized to (p) which can be a single term
                    return uniqueClauses.iterator().next();
                } else {
                    return bOr.build();
                }
            }
            // There's something in the regex we couldn't represent as a query - resort to a match all with verification
            return new MatchAllButRequireVerificationQuery();
        }

        private static void findLeaves(RegExp exp, Kind kind, List<Query> queries) {
            if (exp.kind == kind) {
                findLeaves(exp.exp1, kind, queries);
                findLeaves(exp.exp2, kind, queries);
            } else {
                queries.add(toApproximationQuery(exp));
            }
        }

        private static String toLowerCase(String string) {
            return lowercaseNormalizer.normalize(null, string).utf8ToString();
        }

        // Takes a BooleanQuery + TermQuery tree representing query logic and rewrites using ngrams of appropriate size.
        private Query rewriteBoolToNgramQuery(Query approxQuery) {
            // TODO optimise more intelligently so we:
            // 1) favour full-length term queries eg abc over short eg a* when pruning too many clauses.
            // 2) make MAX_CLAUSES_IN_APPROXIMATION_QUERY a global cap rather than per-boolean clause.
            if (approxQuery == null) {
                return null;
            }
            if (approxQuery instanceof BooleanQuery) {
                BooleanQuery bq = (BooleanQuery) approxQuery;
                BooleanQuery.Builder rewritten = new BooleanQuery.Builder();
                int clauseCount = 0;
                for (BooleanClause clause : bq) {
                    Query q = rewriteBoolToNgramQuery(clause.getQuery());
                    if (q != null) {
                        if (clause.getOccur().equals(Occur.MUST)) {
                            // Can't drop "should" clauses because it can elevate a sibling optional item
                            // to mandatory (shoulds with 1 clause) causing false negatives
                            // Dropping MUSTs increase false positives which are OK because are verified anyway.
                            clauseCount++;
                            if (clauseCount >= MAX_CLAUSES_IN_APPROXIMATION_QUERY) {
                                break;
                            }
                        }
                        rewritten.add(q, clause.getOccur());
                    }
                }
                return simplify(rewritten.build());
            }
            if (approxQuery instanceof TermQuery) {
                TermQuery tq = (TermQuery) approxQuery;

                // Remove simple terms that are only string beginnings or ends.
                String s = tq.getTerm().text();
                if (s.equals(WildcardFieldMapper.TOKEN_START_STRING) || s.equals(WildcardFieldMapper.TOKEN_END_STRING)) {
                    return new MatchAllButRequireVerificationQuery();
                }

                // Break term into tokens
                Set<String> tokens = new LinkedHashSet<>();
                getNgramTokens(tokens, s);
                BooleanQuery.Builder rewritten = new BooleanQuery.Builder();
                for (String string : tokens) {
                    addClause(string, rewritten, Occur.MUST);
                }
                return simplify(rewritten.build());
            }
            if (isMatchAll(approxQuery)) {
                return approxQuery;
            }
            throw new IllegalStateException("Invalid query type found parsing regex query:" + approxQuery);
        }

        static Query simplify(Query input) {
            if (input instanceof BooleanQuery == false) {
                return input;
            }
            BooleanQuery result = (BooleanQuery) input;
            if (result.clauses().size() == 0) {
                // A ".*" clause can produce zero clauses in which case we return MatchAll
                return new MatchAllDocsQuery();
            }
            if (result.clauses().size() == 1) {
                return simplify(result.clauses().get(0).getQuery());
            }

            // We may have a mix of MatchAll and concrete queries - assess if we can simplify
            int matchAllCount = 0;
            int verifyCount = 0;
            boolean allConcretesAreOptional = true;
            for (BooleanClause booleanClause : result.clauses()) {
                Query q = booleanClause.getQuery();
                if (q instanceof MatchAllDocsQuery) {
                    matchAllCount++;
                } else if (q instanceof MatchAllButRequireVerificationQuery) {
                    verifyCount++;
                } else {
                    // Concrete query
                    if (booleanClause.getOccur() != Occur.SHOULD) {
                        allConcretesAreOptional = false;
                    }
                }
            }

            if ((allConcretesAreOptional && matchAllCount > 0)) {
                // Any match all expression takes precedence over all optional concrete queries.
                return new MatchAllDocsQuery();
            }

            if ((allConcretesAreOptional && verifyCount > 0)) {
                // Any match all expression that needs verification takes precedence over all optional concrete queries.
                return new MatchAllButRequireVerificationQuery();
            }

            // We have some mandatory concrete queries - strip out the superfluous match all expressions
            if (allConcretesAreOptional == false && matchAllCount + verifyCount > 0) {
                BooleanQuery.Builder rewritten = new BooleanQuery.Builder();
                for (BooleanClause booleanClause : result.clauses()) {
                    if (isMatchAll(booleanClause.getQuery()) == false) {
                        rewritten.add(booleanClause);
                    }
                }
                return simplify(rewritten.build());
            }
            return result;
        }
static boolean isMatchAll(Query q) {
return q instanceof MatchAllDocsQuery || q instanceof MatchAllButRequireVerificationQuery;
}
protected void getNgramTokens(Set<String> tokens, String fragment) {
if (fragment.equals(TOKEN_START_STRING) || fragment.equals(TOKEN_END_STRING)) {
// If a regex is a form of match-all e.g. ".*" we only produce the token start/end markers as search
// terms which can be ignored.
return;
}
// Break fragment into multiple Ngrams
TokenStream tokenizer = WILDCARD_ANALYZER.tokenStream(name(), fragment);
CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
// If fragment length < NGRAM_SIZE then it is not emitted by token stream so need
// to initialise with the value here
String lastUnusedToken = fragment;
try {
tokenizer.reset();
boolean takeThis = true;
// minimise number of terms searched - eg for "12345" and 3grams we only need terms
// `123` and `345` - no need to search for 234. We take every other ngram.
while (tokenizer.incrementToken()) {
String tokenValue = termAtt.toString();
if (takeThis) {
tokens.add(tokenValue);
lastUnusedToken = null;
} else {
lastUnusedToken = tokenValue;
}
// alternate
takeThis = !takeThis;
if (tokens.size() >= MAX_CLAUSES_IN_APPROXIMATION_QUERY) {
lastUnusedToken = null;
break;
}
}
if (lastUnusedToken != null) {
// given `cake` and 3 grams the loop above would output only `cak` and we need to add trailing
// `ake` to complete the logic.
tokens.add(lastUnusedToken);
}
tokenizer.end();
tokenizer.close();
} catch (IOException ioe) {
throw new ElasticsearchParseException("Error parsing wildcard regex pattern fragment [" + fragment + "]");
}
}
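// Worked example of the alternate-gram trick above: for the fragment "12345" the
// analyzer emits the 3-grams 123, 234, 345; searching 123 and 345 alone still
// covers every character, so 234 is skipped. For "cake" only "cak" is taken in
// the loop, and lastUnusedToken then contributes the trailing "ake".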
private void addClause(String token, BooleanQuery.Builder bqBuilder, Occur occur) {
assert token.codePointCount(0, token.length()) <= NGRAM_SIZE;
int tokenSize = token.codePointCount(0, token.length());
if (tokenSize < 2 || token.equals(WildcardFieldMapper.TOKEN_END_STRING)) {
// there's something concrete to be searched but it's too short
// Require verification.
bqBuilder.add(new BooleanClause(new MatchAllButRequireVerificationQuery(), occur));
return;
}
if (tokenSize == NGRAM_SIZE) {
TermQuery tq = new TermQuery(new Term(name(), token));
bqBuilder.add(new BooleanClause(tq, occur));
} else {
PrefixQuery wq = new PrefixQuery(new Term(name(), token));
wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE);
bqBuilder.add(new BooleanClause(wq, occur));
}
}
@Override
public Query fuzzyQuery(
Object value,
Fuzziness fuzziness,
int prefixLength,
int maxExpansions,
boolean transpositions,
QueryShardContext context
) {
String searchTerm = BytesRefs.toString(value);
String lowerSearchTerm = toLowerCase(searchTerm);
try {
BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
//The approximation query can have a prefix and any number of ngrams.
BooleanQuery.Builder approxBuilder = new BooleanQuery.Builder();
String postPrefixString = lowerSearchTerm;
// Add all content prior to prefixLength as a MUST clause to the ngram index query
if (prefixLength > 0) {
Set<String> prefixTokens = new LinkedHashSet<>();
postPrefixString = lowerSearchTerm.substring(prefixLength);
String prefixCandidate = TOKEN_START_OR_END_CHAR + lowerSearchTerm.substring(0, prefixLength);
getNgramTokens(prefixTokens, prefixCandidate);
for (String prefixToken : prefixTokens) {
addClause(prefixToken, approxBuilder, Occur.MUST);
}
}
// Tokenize all content after the prefix
TokenStream tokenizer = WILDCARD_ANALYZER.tokenStream(name(), postPrefixString);
CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
ArrayList<String> postPrefixTokens = new ArrayList<>();
String firstToken = null;
tokenizer.reset();
int tokenNumber = 0;
while (tokenizer.incrementToken()) {
if (tokenNumber == 0) {
String token = termAtt.toString();
if (firstToken == null) {
firstToken = token;
}
postPrefixTokens.add(token);
}
// Take every 3rd ngram so they are all disjoint. Our calculation for min_should_match
// number relies on there being no overlaps
tokenNumber++;
if (tokenNumber == 3) {
tokenNumber = 0;
}
}
tokenizer.end();
tokenizer.close();
BooleanQuery.Builder ngramBuilder = new BooleanQuery.Builder();
int numClauses = 0;
for (String token : postPrefixTokens) {
addClause(token, ngramBuilder, Occur.SHOULD);
numClauses++;
}
// Approximation query
if (numClauses > fuzziness.asDistance(searchTerm)) {
// Useful accelerant - set min should match based on number of permitted edits.
ngramBuilder.setMinimumNumberShouldMatch(numClauses - fuzziness.asDistance(searchTerm));
approxBuilder.add(ngramBuilder.build(), Occur.MUST);
}
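// Worked example of the accelerant above (illustrative; mirrors testFuzzyAcceleration
// in the tests below): for "12345678901" with prefixLength=2 and one permitted edit,
// the prefix ngram "\u000012" is a MUST clause, the remainder is cut into the disjoint
// 3-grams 345, 678 and 901, and minimumNumberShouldMatch = 3 grams - 1 edit = 2, the
// assumption being that with no overlaps each permitted edit can disturb at most one gram.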
BooleanQuery ngramQ = approxBuilder.build();
if (ngramQ.clauses().size()>0) {
bqBuilder.add(ngramQ, Occur.MUST);
}
Supplier <Automaton> deferredAutomatonSupplier = ()->{
// Verification query
FuzzyQuery fq = new FuzzyQuery(
new Term(name(), searchTerm),
fuzziness.asDistance(searchTerm),
prefixLength,
maxExpansions,
transpositions
);
return fq.getAutomata().automaton;
};
bqBuilder.add(new AutomatonQueryOnBinaryDv(name(), searchTerm, deferredAutomatonSupplier), Occur.MUST);
return bqBuilder.build();
} catch (IOException ioe) {
throw new ElasticsearchParseException("Error parsing wildcard field fuzzy string [" + searchTerm + "]");
}
    }

    @Override
@@ -568,7 +900,10 @@ public class WildcardFieldMapper extends FieldMapper {
         if (value == null || value.length() > ignoreAbove) {
             return;
         }
-        String ngramValue = TOKEN_START_OR_END_CHAR + value + TOKEN_START_OR_END_CHAR + TOKEN_START_OR_END_CHAR;
+        // Always lower case the ngram index and value - helps with
+        // a) speed (less ngram variations to explore on disk and in RAM-based automaton) and
+        // b) uses less disk space
+        String ngramValue = addLineEndChars(WildcardFieldType.toLowerCase(value));
         Field ngramField = new Field(fieldType().name(), ngramValue, ngramFieldType);
         fields.add(ngramField);
@@ -581,6 +916,11 @@ public class WildcardFieldMapper extends FieldMapper {
         }
     }
// Values held in the ngram index are encoded with special characters to denote start and end of values.
static String addLineEndChars(String value) {
return TOKEN_START_OR_END_CHAR + value + TOKEN_START_OR_END_CHAR + TOKEN_START_OR_END_CHAR;
}
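// Worked example (illustrative): the value "FooBar" is lower-cased and indexed as
// "\u0000foobar\u0000\u0000" - a single 0 char (TOKEN_START_STRING) marks the start
// of the value and a double 0 char (TOKEN_END_STRING) marks the end, so anchored
// patterns can match the markers as ordinary ngram characters.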
@Override
protected String contentType() {
return CONTENT_TYPE;

View File: WildcardFieldMapperTests.java

@@ -6,6 +6,7 @@
 package org.elasticsearch.xpack.wildcard.mapper;

+import org.apache.lucene.analysis.core.KeywordAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.SortedSetDocValuesField;
@@ -15,20 +16,31 @@ import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.classic.QueryParser;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.DocValuesFieldExistsQuery;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.MatchNoDocsQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.Sort;
 import org.apache.lucene.search.SortField;
+import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.WildcardQuery;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.ByteRunAutomaton;
+import org.apache.lucene.util.automaton.RegExp;
 import org.elasticsearch.Version;
 import org.elasticsearch.cluster.metadata.IndexMetadata;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.unit.Fuzziness;
 import org.elasticsearch.common.util.BigArrays;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.IndexSettings;
@@ -55,12 +67,22 @@ import java.util.HashSet;
 import java.util.function.BiFunction;

 import static org.hamcrest.Matchers.equalTo;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;

 public class WildcardFieldMapperTests extends ESTestCase {

+    static QueryShardContext createMockQueryShardContext(boolean allowExpensiveQueries) {
+        QueryShardContext queryShardContext = mock(QueryShardContext.class);
+        when(queryShardContext.allowExpensiveQueries()).thenReturn(allowExpensiveQueries);
+        return queryShardContext;
+    }
+
     private static final String KEYWORD_FIELD_NAME = "keyword_field";
     private static final String WILDCARD_FIELD_NAME = "wildcard_field";
-    static final int MAX_FIELD_LENGTH = 100;
+    public static final QueryShardContext MOCK_QSC = createMockQueryShardContext(true);
+    static final int MAX_FIELD_LENGTH = 30;
     static WildcardFieldMapper wildcardFieldType;
     static KeywordFieldMapper keywordFieldType;
@@ -136,11 +158,18 @@ public class WildcardFieldMapperTests extends ESTestCase {
         IndexSearcher searcher = newSearcher(reader);
         iw.close();

+        // Test wildcard query
         String queryString = randomABString((BooleanQuery.getMaxClauseCount() * 2) + 1);
         Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(queryString, null, null);
         TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, 10, Sort.INDEXORDER);
         assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(0L));

+        // Test regexp query
+        wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(queryString, RegExp.ALL, 20000, null, MOCK_QSC);
+        wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, 10, Sort.INDEXORDER);
+        assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(0L));
+
         reader.close();
         dir.close();
     }
@@ -181,15 +210,59 @@ public class WildcardFieldMapperTests extends ESTestCase {
         int numSearches = 100;
         for (int i = 0; i < numSearches; i++) {
-            String randomWildcardPattern = getRandomWildcardPattern();
-            Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(randomWildcardPattern, null, null);
-            TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, values.size() + 1, Sort.INDEXORDER);
-
-            Query keywordFieldQuery = new WildcardQuery(new Term(KEYWORD_FIELD_NAME, randomWildcardPattern));
-            TopDocs kwTopDocs = searcher.search(keywordFieldQuery, values.size() + 1, Sort.INDEXORDER);
-
-            assertThat(kwTopDocs.totalHits.value, equalTo(wildcardFieldTopDocs.totalHits.value));
+            Query wildcardFieldQuery = null;
+            Query keywordFieldQuery = null;
+            String pattern = null;
+            switch (randomInt(3)) {
+                case 0:
+                    pattern = getRandomWildcardPattern();
+                    wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(pattern, null, MOCK_QSC);
+                    keywordFieldQuery = keywordFieldType.fieldType().wildcardQuery(pattern, null, MOCK_QSC);
+                    break;
+                case 1:
+                    pattern = getRandomRegexPattern(values);
+                    wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(pattern, RegExp.ALL, 20000, null, MOCK_QSC);
+                    keywordFieldQuery = keywordFieldType.fieldType().regexpQuery(pattern, RegExp.ALL, 20000, null, MOCK_QSC);
+                    break;
+                case 2:
+                    pattern = randomABString(5);
+                    wildcardFieldQuery = wildcardFieldType.fieldType().prefixQuery(pattern, null, MOCK_QSC);
+                    keywordFieldQuery = keywordFieldType.fieldType().prefixQuery(pattern, null, MOCK_QSC);
+                    break;
+                case 3:
+                    int edits = randomInt(2);
+                    int prefixLength = randomInt(4);
+                    pattern = getRandomFuzzyPattern(values, edits, prefixLength);
+                    Fuzziness fuzziness = Fuzziness.AUTO;
+                    switch (edits) {
+                        case 0:
+                            fuzziness = Fuzziness.ZERO;
+                            break;
+                        case 1:
+                            fuzziness = Fuzziness.ONE;
+                            break;
+                        case 2:
+                            fuzziness = Fuzziness.TWO;
+                            break;
+                        default:
+                            break;
+                    }
+                    // Prefix length shouldn't be longer than selected search string
+                    // BUT keyword field has a bug with prefix length when equal - see https://github.com/elastic/elasticsearch/issues/55790
+                    // so we opt for one less
+                    prefixLength = Math.min(pattern.length() - 1, prefixLength);
+                    boolean transpositions = randomBoolean();
+
+                    wildcardFieldQuery = wildcardFieldType.fieldType().fuzzyQuery(pattern, fuzziness, prefixLength, 50,
+                        transpositions, MOCK_QSC);
+                    keywordFieldQuery = keywordFieldType.fieldType().fuzzyQuery(pattern, fuzziness, prefixLength, 50,
+                        transpositions, MOCK_QSC);
+                    break;
+            }
+            TopDocs kwTopDocs = searcher.search(keywordFieldQuery, values.size() + 1, Sort.RELEVANCE);
+            TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, values.size() + 1, Sort.RELEVANCE);
+            assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(kwTopDocs.totalHits.value));

             HashSet<Integer> expectedDocs = new HashSet<>();
             for (ScoreDoc topDoc : kwTopDocs.scoreDocs) {
@@ -201,7 +274,6 @@ public class WildcardFieldMapperTests extends ESTestCase {
             assertThat(expectedDocs.size(), equalTo(0));
         }
-
         //Test keyword and wildcard sort operations are also equivalent
         QueryShardContext shardContextMock = createMockShardContext();
@@ -222,7 +294,333 @@ public class WildcardFieldMapperTests extends ESTestCase {
         dir.close();
     }
public void testRegexAcceleration() throws IOException, ParseException {
// All these expressions should rewrite to a match all with no verification step required at all
String superfastRegexes[]= { ".*", "...*..", "(foo|bar|.*)", "@"};
for (String regex : superfastRegexes) {
Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp.ALL, 20000, null, MOCK_QSC);
assertTrue(wildcardFieldQuery instanceof DocValuesFieldExistsQuery);
}
String matchNoDocsRegexes[]= { ""};
for (String regex : matchNoDocsRegexes) {
Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp.ALL, 20000, null, MOCK_QSC);
assertTrue(wildcardFieldQuery instanceof MatchNoDocsQuery);
}
// All of these regexes should be accelerated as the equivalent of the given QueryString query
String acceleratedTests[][] = {
{".*foo.*", "foo"},
{"..foobar","+foo +oba +ar_ +r__"},
{"(maynotexist)?foobar","+foo +oba +ar_ +r__"},
{".*/etc/passw.*", "+\\/et +tc\\/ +\\/pa +ass +ssw"},
{".*etc/passwd", "+etc +c\\/p +pas +ssw +wd_ +d__"},
{"(http|ftp)://foo.*", "+((+htt +ttp) ftp) +(+\\:\\/\\/ +\\/fo +foo)"},
{"[Pp][Oo][Ww][Ee][Rr][Ss][Hh][Ee][Ll][Ll]\\.[Ee][Xx][Ee]", "+_po +owe +ers +she +ell +l\\.e +exe +e__"},
{"foo<1-100>bar", "+(+_fo +foo) +(+bar +r__ )"},
{"(aaa.+&.+bbb)cat", "+cat +t__"},
{".a", "a__"}
};
for (String[] test : acceleratedTests) {
String regex = test[0];
String expectedAccelerationQueryString = test[1].replaceAll("_", ""+WildcardFieldMapper.TOKEN_START_OR_END_CHAR);
Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp.ALL, 20000, null, MOCK_QSC);
testExpectedAccelerationQuery(regex, wildcardFieldQuery, expectedAccelerationQueryString);
}
// All these expressions should rewrite to just the verification query (there's no ngram acceleration)
// TODO we can possibly improve on some of these
String matchAllButVerifyTests[]= { "..", "(a)?","(a|b){0,3}", "((foo)?|(foo|bar)?)", "@&~(abc.+)", "aaa.+&.+bbb"};
for (String regex : matchAllButVerifyTests) {
Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp.ALL, 20000, null, MOCK_QSC);
assertTrue(regex +" was not a pure verify query " +formatQuery(wildcardFieldQuery),
wildcardFieldQuery instanceof AutomatonQueryOnBinaryDv);
}
// Documentation - regexes that do try accelerate but we would like to improve in future versions.
String suboptimalTests[][] = {
// TODO short wildcards like a* OR b* aren't great so we just drop them.
// Ideally we would attach to successors to create (acd OR bcd)
{ "[ab]cd", "+cd_ +d__"}
};
for (String[] test : suboptimalTests) {
String regex = test[0];
String expectedAccelerationQueryString = test[1].replaceAll("_", ""+WildcardFieldMapper.TOKEN_START_OR_END_CHAR);
Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp.ALL, 20000, null, MOCK_QSC);
testExpectedAccelerationQuery(regex, wildcardFieldQuery, expectedAccelerationQueryString);
}
}
// Make error messages more readable
String formatQuery(Query q) {
return q.toString().replaceAll(WILDCARD_FIELD_NAME+":", "").replaceAll(WildcardFieldMapper.TOKEN_START_STRING, "_");
}
public void testWildcardAcceleration() throws IOException, ParseException {
// All these expressions should rewrite to MatchAll with no verification step required at all
String superfastPattern[] = { "*", "**", "*?" };
for (String pattern : superfastPattern) {
Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(pattern, null, MOCK_QSC);
assertTrue(
pattern + " was not a pure match all query " + formatQuery(wildcardFieldQuery),
wildcardFieldQuery instanceof DocValuesFieldExistsQuery
);
}
// All of these patterns should be accelerated.
String tests[][] = {
{ "*foobar", "+foo +oba +ar_ +r__" },
{ "foobar*", "+_fo +oob +bar" },
{ "foo\\*bar*", "+_fo +oo\\* +\\*ba +bar" },
{ "foo\\?bar*", "+_fo +oo\\? +\\?ba +bar" },
{ "foo*bar", "+_fo +foo +bar +r__" },
{ "foo?bar", "+_fo +foo +bar +r__" },
{ "?foo*bar?", "+foo +bar" },
{ "*c", "+c__" } };
for (String[] test : tests) {
String pattern = test[0];
String expectedAccelerationQueryString = test[1].replaceAll("_", "" + WildcardFieldMapper.TOKEN_START_OR_END_CHAR);
Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(pattern, null, MOCK_QSC);
testExpectedAccelerationQuery(pattern, wildcardFieldQuery, expectedAccelerationQueryString);
assertTrue(wildcardFieldQuery instanceof BooleanQuery);
}
// TODO All these expressions have no acceleration at all and could be improved
String slowPatterns[] = { "??" };
for (String pattern : slowPatterns) {
Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(pattern, null, MOCK_QSC);
assertTrue(
pattern + " was not as slow as we assumed " + formatQuery(wildcardFieldQuery),
wildcardFieldQuery instanceof AutomatonQueryOnBinaryDv
);
}
}
static class FuzzyTest {
String pattern;
int prefixLength;
Fuzziness fuzziness;
String expectedPrefixQuery;
int expectedMinShouldMatch;
String ngrams;
FuzzyTest(
String pattern,
int prefixLength,
Fuzziness fuzziness,
String expectedPrefixQuery,
int expectedMinShouldMatch,
String ngrams
) {
super();
this.pattern = pattern;
this.prefixLength = prefixLength;
this.fuzziness = fuzziness;
this.expectedPrefixQuery = expectedPrefixQuery;
this.expectedMinShouldMatch = expectedMinShouldMatch;
this.ngrams = ngrams;
}
Query getFuzzyQuery() {
return wildcardFieldType.fieldType().fuzzyQuery(pattern, fuzziness, prefixLength, 50, true, MOCK_QSC);
}
Query getExpectedApproxQuery() throws ParseException {
BooleanQuery.Builder bq = new BooleanQuery.Builder();
if (expectedPrefixQuery != null) {
String[] tokens = expectedPrefixQuery.split(" ");
Query prefixQuery = null;
if (tokens.length == 1) {
prefixQuery = new TermQuery(
new Term(WILDCARD_FIELD_NAME, tokens[0].replaceAll("_", WildcardFieldMapper.TOKEN_START_STRING))
);
} else {
BooleanQuery.Builder pqb = new BooleanQuery.Builder();
for (String token : tokens) {
Query ngramQuery = new TermQuery(
new Term(WILDCARD_FIELD_NAME, token.replaceAll("_", WildcardFieldMapper.TOKEN_START_STRING))
);
pqb.add(ngramQuery, Occur.MUST);
}
prefixQuery = pqb.build();
}
if (ngrams == null) {
return prefixQuery;
}
bq.add(prefixQuery, Occur.MUST);
}
if (ngrams != null) {
BooleanQuery.Builder nq = new BooleanQuery.Builder();
String[] tokens = ngrams.split(" ");
for (String token : tokens) {
Query ngramQuery = new TermQuery(
new Term(WILDCARD_FIELD_NAME, token.replaceAll("_", WildcardFieldMapper.TOKEN_START_STRING))
);
nq.add(ngramQuery, Occur.SHOULD);
}
nq.setMinimumNumberShouldMatch(expectedMinShouldMatch);
bq.add(nq.build(), Occur.MUST);
}
return bq.build();
}
}
public void testFuzzyAcceleration() throws IOException, ParseException {
FuzzyTest[] tests = {
new FuzzyTest("123456", 0, Fuzziness.ONE, null, 1, "123 456"),
new FuzzyTest("1234567890", 2, Fuzziness.ONE, "_12", 1, "345 678"),
new FuzzyTest("12345678901", 2, Fuzziness.ONE, "_12", 2, "345 678 901"),
new FuzzyTest("12345678", 4, Fuzziness.ONE, "_12 234", 0, null)
};
for (FuzzyTest test : tests) {
Query wildcardFieldQuery = test.getFuzzyQuery();
testExpectedAccelerationQuery(test.pattern, wildcardFieldQuery, test.getExpectedApproxQuery());
}
}
void testExpectedAccelerationQuery(String regex, Query combinedQuery, String expectedAccelerationQueryString) throws ParseException {
QueryParser qsp = new QueryParser(WILDCARD_FIELD_NAME, new KeywordAnalyzer());
Query expectedAccelerationQuery = qsp.parse(expectedAccelerationQueryString);
testExpectedAccelerationQuery(regex, combinedQuery, expectedAccelerationQuery);
}
void testExpectedAccelerationQuery(String regex, Query combinedQuery, Query expectedAccelerationQuery) throws ParseException {
BooleanQuery cq = (BooleanQuery) combinedQuery;
assert cq.clauses().size() == 2;
Query approximationQuery = null;
boolean verifyQueryFound = false;
for (BooleanClause booleanClause : cq.clauses()) {
Query q = booleanClause.getQuery();
if (q instanceof AutomatonQueryOnBinaryDv) {
verifyQueryFound = true;
} else {
approximationQuery = q;
}
}
assert verifyQueryFound;
String message = "regex: "+ regex +"\nactual query: " + formatQuery(approximationQuery) +
"\nexpected query: " + formatQuery(expectedAccelerationQuery) + "\n";
assertEquals(message, expectedAccelerationQuery, approximationQuery);
}
private String getRandomFuzzyPattern(HashSet<String> values, int edits, int prefixLength) {
assert edits >=0 && edits <=2;
// Pick one of the indexed document values to focus our queries on.
String randomValue = values.toArray(new String[0])[randomIntBetween(0, values.size()-1)];
if (edits == 0) {
return randomValue;
}
if (randomValue.length() > prefixLength) {
randomValue = randomValue.substring(0,prefixLength) + "C" + randomValue.substring(prefixLength);
edits--;
}
if(edits > 0) {
randomValue = randomValue + "a";
}
return randomValue;
}
private String getRandomRegexPattern(HashSet<String> values) {
// Pick one of the indexed document values to focus our queries on.
String randomValue = values.toArray(new String[0])[randomIntBetween(0, values.size()-1)];
return convertToRandomRegex(randomValue);
}
// Produces a random regex string guaranteed to match the provided value
protected String convertToRandomRegex(String randomValue) {
StringBuilder result = new StringBuilder();
//Pick a part of the string to change
int substitutionPoint = randomIntBetween(0, randomValue.length()-1);
int substitutionLength = randomIntBetween(1, Math.min(10, randomValue.length() - substitutionPoint));
//Add any head to the result, unchanged
if(substitutionPoint >0) {
result.append(randomValue.substring(0,substitutionPoint));
}
// Modify the middle...
String replacementPart = randomValue.substring(substitutionPoint, substitutionPoint+substitutionLength);
int mutation = randomIntBetween(0, 11);
switch (mutation) {
case 0:
// OR with random alpha of same length
result.append("("+replacementPart+"|c"+ randomABString(replacementPart.length())+")");
break;
case 1:
// OR with non-existant value
result.append("("+replacementPart+"|doesnotexist)");
break;
case 2:
// OR with another randomised regex (used to create nested levels of expression).
result.append("(" + convertToRandomRegex(replacementPart) +"|doesnotexist)");
break;
case 3:
// Star-replace all ab sequences.
result.append(replacementPart.replaceAll("ab", ".*"));
break;
case 4:
// .-replace all b chars
result.append(replacementPart.replaceAll("b", "."));
break;
case 5:
// length-limited stars {1,2}
result.append(".{1,"+replacementPart.length()+"}");
break;
case 6:
// replace all chars with .
result.append(replacementPart.replaceAll(".", "."));
break;
case 7:
// OR with uppercase chars eg [aA] (many of these sorts of expression in the wild..
char [] chars = replacementPart.toCharArray();
for (char c : chars) {
result.append("[" + c + Character.toUpperCase(c) +"]");
}
break;
case 8:
// NOT a character - replace all b's with "not a"
result.append(replacementPart.replaceAll("b", "[^a]"));
break;
case 9:
// Make whole part repeatable 1 or more times
result.append("(" + replacementPart +")+");
break;
case 10:
// Make whole part repeatable 0 or more times
result.append("(" + replacementPart +")?");
break;
case 11:
// all but ... syntax
result.append("@&~(doesnotexist.+)");
break;
default:
break;
}
//add any remaining tail, unchanged
if(substitutionPoint + substitutionLength <= randomValue.length()-1) {
result.append(randomValue.substring(substitutionPoint + substitutionLength));
}
//Assert our randomly generated regex actually matches the provided raw input.
RegExp regex = new RegExp(result.toString());
Automaton automaton = regex.toAutomaton();
ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automaton);
BytesRef br = new BytesRef(randomValue);
assertTrue("[" + result.toString() + "]should match [" + randomValue + "]" + substitutionPoint + "-" + substitutionLength + "/"
+ randomValue.length(), bytesMatcher.run(br.bytes, br.offset, br.length));
return result.toString();
}
 protected MappedFieldType provideMappedFieldType(String name) {
     if (name.equals(WILDCARD_FIELD_NAME)) {
@@ -284,7 +682,11 @@ public class WildcardFieldMapperTests extends ESTestCase {
         StringBuilder sb = new StringBuilder();
         while (sb.length() < minLength) {
             if (randomBoolean()) {
-                sb.append("a");
+                if (randomBoolean()) {
+                    sb.append("a");
+                } else {
+                    sb.append("A");
+                }
             } else {
                 sb.append("b");
             }