Backport of #55548. Adds equivalence with the keyword field to the wildcard field: regexp, fuzzy, wildcard and prefix queries are all supported. All queries use an ngram-based approximation query backed by an automaton-based verification query. Closes #54275
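The core pattern used throughout the diff below is to AND a cheap, possibly over-matching ngram query with an exact automaton-backed verification query. A minimal sketch of that composition, assuming hypothetical `approximation` and `verification` queries (the names are illustrative, not this PR's API):

```java
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;

class ApproxThenVerify {
    // Both clauses are MUST, so only docs passing the cheap ngram
    // pre-filter are checked by the expensive automaton verification.
    static Query approxThenVerify(Query approximation, Query verification) {
        return new BooleanQuery.Builder()
            .add(approximation, Occur.MUST) // fast ngram filter (may over-match)
            .add(verification, Occur.MUST)  // exact check against doc values
            .build();
    }
}
```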
This commit is contained in: parent 9f1e3bc82b, commit b2bc6071fd
@@ -1,8 +1,8 @@
 setup:
   - skip:
       features: headers
-      version: " - 7.7.99"
-      reason: "wildcard fields were added from 7.8"
+      version: " - 7.8.99"
+      reason: "wildcard fields were added from 7.9"

   - do:
       indices.create:
@@ -24,6 +24,7 @@ import org.apache.lucene.util.automaton.ByteRunAutomaton;

 import java.io.IOException;
 import java.util.Objects;
+import java.util.function.Supplier;

 /**
  * Query that runs an Automaton across all binary doc values.
@@ -33,18 +34,19 @@ public class AutomatonQueryOnBinaryDv extends Query {

     private final String field;
     private final String matchPattern;
-    private final Automaton automaton;
+    private final Supplier<Automaton> automatonSupplier;

-    public AutomatonQueryOnBinaryDv(String field, String matchPattern, Automaton automaton) {
+    public AutomatonQueryOnBinaryDv(String field, String matchPattern, Supplier<Automaton> automatonSupplier) {
         this.field = field;
         this.matchPattern = matchPattern;
-        this.automaton = automaton;
+        this.automatonSupplier = automatonSupplier;
     }

     @Override
     public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {

-        ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automaton);
+        ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automatonSupplier.get());

         return new ConstantScoreWeight(this, boost) {

@@ -92,8 +94,11 @@ public class AutomatonQueryOnBinaryDv extends Query {

     @Override
     public boolean equals(Object obj) {
+        if (obj == null || obj.getClass() != getClass()) {
+            return false;
+        }
         AutomatonQueryOnBinaryDv other = (AutomatonQueryOnBinaryDv) obj;
         return Objects.equals(field, other.field) && Objects.equals(matchPattern, other.matchPattern);
     }

     @Override
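The switch from an eager Automaton to a Supplier<Automaton> above means the potentially expensive automaton construction only happens when createWeight runs, i.e. when the query actually executes rather than when it is built. A standalone sketch of the idiom (not this PR's code):

```java
import java.util.function.Supplier;

class DeferredWork {
    private final Supplier<String> expensive;

    DeferredWork(Supplier<String> expensive) {
        this.expensive = expensive; // nothing computed yet
    }

    String run() {
        return expensive.get(); // cost is paid only here, mirroring createWeight()
    }

    public static void main(String[] args) {
        DeferredWork w = new DeferredWork(() -> "built lazily");
        System.out.println(w.run());
    }
}
```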
@@ -0,0 +1,50 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+
+package org.elasticsearch.xpack.wildcard.mapper;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.QueryVisitor;
+
+import java.io.IOException;
+
+/**
+ * A query that matches all documents. The class is more of a marker
+ * that we encountered something that will need verification.
+ * (A MatchAllDocs query is used to indicate we can match all
+ * _without_ verification)
+ */
+public final class MatchAllButRequireVerificationQuery extends Query {
+
+    @Override
+    public Query rewrite(IndexReader reader) throws IOException {
+        return new MatchAllDocsQuery();
+    }
+
+    @Override
+    public String toString(String field) {
+        return "*:* (tbc)";
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        return sameClassAs(o);
+    }
+
+    @Override
+    public int hashCode() {
+        return classHash();
+    }
+
+    @Override
+    public void visit(QueryVisitor visitor) {
+        visitor.visitLeaf(this);
+    }
+
+}
@@ -21,24 +21,32 @@ import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.ConstantScoreQuery;
 import org.apache.lucene.search.DocValuesFieldExistsQuery;
+import org.apache.lucene.search.FuzzyQuery;
 import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.MatchNoDocsQuery;
 import org.apache.lucene.search.MultiTermQuery;
 import org.apache.lucene.search.MultiTermQuery.RewriteMethod;
+import org.apache.lucene.search.PrefixQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.SortField;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.WildcardQuery;
 import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.RegExp;
+import org.apache.lucene.util.automaton.RegExp.Kind;
+import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.ElasticsearchParseException;
 import org.elasticsearch.common.lucene.BytesRefs;
 import org.elasticsearch.common.lucene.Lucene;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.unit.Fuzziness;
 import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.common.xcontent.XContentParser;
 import org.elasticsearch.common.xcontent.support.XContentMapValues;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.analysis.AnalyzerScope;
+import org.elasticsearch.index.analysis.LowercaseNormalizer;
 import org.elasticsearch.index.analysis.NamedAnalyzer;
 import org.elasticsearch.index.fielddata.IndexFieldData;
 import org.elasticsearch.index.fielddata.IndexFieldData.XFieldComparatorSource.Nested;
@@ -63,11 +71,16 @@ import org.elasticsearch.search.aggregations.support.ValuesSourceType;
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.Iterator;
+import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
+import java.util.function.Supplier;

 import static org.elasticsearch.index.mapper.TypeParsers.parseField;
+import static org.elasticsearch.search.SearchService.ALLOW_EXPENSIVE_QUERIES;

 /**
  * A {@link FieldMapper} for indexing fields with ngrams for efficient wildcard matching
@@ -206,8 +219,12 @@ public class WildcardFieldMapper extends FieldMapper {
     }

     public static final char TOKEN_START_OR_END_CHAR = 0;
+    public static final String TOKEN_START_STRING = Character.toString(TOKEN_START_OR_END_CHAR);
+    public static final String TOKEN_END_STRING = TOKEN_START_STRING + TOKEN_START_STRING;

     public static final class WildcardFieldType extends MappedFieldType {

+        static Analyzer lowercaseNormalizer = new LowercaseNormalizer();
+
         public WildcardFieldType() {
             setIndexAnalyzer(Lucene.KEYWORD_ANALYZER);
@@ -223,218 +240,533 @@ public class WildcardFieldMapper extends FieldMapper {
             return result;
         }

-        // Holds parsed information about the wildcard pattern
-        static class PatternStructure {
-            boolean openStart, openEnd, hasSymbols;
-            int lastGap = 0;
-            int wildcardCharCount, wildcardStringCount;
-            String[] fragments;
-            Integer[] precedingGapSizes;
-            final String pattern;
-
-            @SuppressWarnings("fallthrough") // Intentionally uses fallthrough mirroring implementation in Lucene's WildcardQuery
-            PatternStructure(String wildcardText) {
-                this.pattern = wildcardText;
-                ArrayList<String> fragmentList = new ArrayList<>();
-                ArrayList<Integer> precedingGapSizeList = new ArrayList<>();
-                StringBuilder sb = new StringBuilder();
-                for (int i = 0; i < wildcardText.length();) {
-                    final int c = wildcardText.codePointAt(i);
-                    int length = Character.charCount(c);
-                    switch (c) {
-                        case WildcardQuery.WILDCARD_STRING:
-                            if (i == 0) {
-                                openStart = true;
-                            }
-                            openEnd = true;
-                            hasSymbols = true;
-                            wildcardStringCount++;
-
-                            if (sb.length() > 0) {
-                                precedingGapSizeList.add(lastGap);
-                                fragmentList.add(sb.toString());
-                                sb = new StringBuilder();
-                            }
-                            lastGap = Integer.MAX_VALUE;
-                            break;
-                        case WildcardQuery.WILDCARD_CHAR:
-                            if (i == 0) {
-                                openStart = true;
-                            }
-                            hasSymbols = true;
-                            wildcardCharCount++;
-                            openEnd = true;
-                            if (sb.length() > 0) {
-                                precedingGapSizeList.add(lastGap);
-                                fragmentList.add(sb.toString());
-                                sb = new StringBuilder();
-                                lastGap = 0;
-                            }
-
-                            if (lastGap != Integer.MAX_VALUE) {
-                                lastGap++;
-                            }
-                            break;
-                        case WildcardQuery.WILDCARD_ESCAPE:
-                            // add the next codepoint instead, if it exists
-                            if (i + length < wildcardText.length()) {
-                                final int nextChar = wildcardText.codePointAt(i + length);
-                                length += Character.charCount(nextChar);
-                                sb.append(Character.toChars(nextChar));
-                                openEnd = false;
-                                break;
-                            } // else fallthru, lenient parsing with a trailing \
-                        default:
-                            openEnd = false;
-                            sb.append(Character.toChars(c));
-                    }
-                    i += length;
-                }
-                if (sb.length() > 0) {
-                    precedingGapSizeList.add(lastGap);
-                    fragmentList.add(sb.toString());
-                    lastGap = 0;
-                }
-                fragments = fragmentList.toArray(new String[0]);
-                precedingGapSizes = precedingGapSizeList.toArray(new Integer[0]);
-
-            }
-
-            public boolean needsVerification() {
-                // Return true if term queries are not enough evidence
-                if (fragments.length == 1 && wildcardCharCount == 0) {
-                    // The one case where we don't need verification is when
-                    // we have a single fragment and no ? characters
-                    return false;
-                }
-                return true;
-            }
-
-            // Returns number of positions for last gap (Integer.MAX means unlimited gap)
-            public int getPrecedingGapSize(int fragmentNum) {
-                return precedingGapSizes[fragmentNum];
-            }
-
-            public boolean isMatchAll() {
-                return fragments.length == 0 && wildcardStringCount > 0 && wildcardCharCount == 0;
-            }
-
-            @Override
-            public int hashCode() {
-                return pattern.hashCode();
-            }
-
-            @Override
-            public boolean equals(Object obj) {
-                PatternStructure other = (PatternStructure) obj;
-                return pattern.equals(other.pattern);
-            }
-
-        }
-
         @Override
         public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QueryShardContext context) {
-            PatternStructure patternStructure = new PatternStructure(wildcardPattern);
-            ArrayList<String> tokens = new ArrayList<>();
-
-            for (int i = 0; i < patternStructure.fragments.length; i++) {
-                String fragment = patternStructure.fragments[i];
-                int fLength = fragment.length();
-                if (fLength == 0) {
-                    continue;
-                }
-
-                // Add any start/end of string character
-                if (i == 0 && patternStructure.openStart == false) {
-                    // Start-of-string anchored (is not a leading wildcard)
-                    fragment = TOKEN_START_OR_END_CHAR + fragment;
-                }
-                if (patternStructure.openEnd == false && i == patternStructure.fragments.length - 1) {
-                    // End-of-string anchored (is not a trailing wildcard)
-                    fragment = fragment + TOKEN_START_OR_END_CHAR + TOKEN_START_OR_END_CHAR;
-                }
-                if (fragment.codePointCount(0, fragment.length()) <= NGRAM_SIZE) {
-                    tokens.add(fragment);
+            String ngramIndexPattern = addLineEndChars(toLowerCase(wildcardPattern));
+
+            // Break search term into tokens
+            Set<String> tokens = new LinkedHashSet<>();
+            StringBuilder sequence = new StringBuilder();
+            int numWildcardChars = 0;
+            int numWildcardStrings = 0;
+            for (int i = 0; i < ngramIndexPattern.length();) {
+                final int c = ngramIndexPattern.codePointAt(i);
+                int length = Character.charCount(c);
+                switch (c) {
+                    case WildcardQuery.WILDCARD_STRING:
+                        if (sequence.length() > 0) {
+                            getNgramTokens(tokens, sequence.toString());
+                            sequence = new StringBuilder();
+                        }
+                        numWildcardStrings++;
+                        break;
+                    case WildcardQuery.WILDCARD_CHAR:
+                        if (sequence.length() > 0) {
+                            getNgramTokens(tokens, sequence.toString());
+                            sequence = new StringBuilder();
+                        }
+                        numWildcardChars++;
+                        break;
+                    case WildcardQuery.WILDCARD_ESCAPE:
+                        // add the next codepoint instead, if it exists
+                        if (i + length < ngramIndexPattern.length()) {
+                            final int nextChar = ngramIndexPattern.codePointAt(i + length);
+                            length += Character.charCount(nextChar);
+                            sequence.append(Character.toChars(nextChar));
+                        } else {
+                            sequence.append(Character.toChars(c));
+                        }
+                        break;
+                    default:
+                        sequence.append(Character.toChars(c));
+                }
+                i += length;
+            }
+
+            if (sequence.length() > 0) {
+                getNgramTokens(tokens, sequence.toString());
+            }
+
+            BooleanQuery.Builder rewritten = new BooleanQuery.Builder();
+            int clauseCount = 0;
+            for (String string : tokens) {
+                if (clauseCount >= MAX_CLAUSES_IN_APPROXIMATION_QUERY) {
+                    break;
+                }
+                addClause(string, rewritten, Occur.MUST);
+                clauseCount++;
+            }
+            Supplier<Automaton> deferredAutomatonSupplier = () -> {
+                return WildcardQuery.toAutomaton(new Term(name(), wildcardPattern));
+            };
+            AutomatonQueryOnBinaryDv verifyingQuery = new AutomatonQueryOnBinaryDv(name(), wildcardPattern, deferredAutomatonSupplier);
+            if (clauseCount > 0) {
+                // We can accelerate execution with the ngram query
+                BooleanQuery approxQuery = rewritten.build();
+                BooleanQuery.Builder verifyingBuilder = new BooleanQuery.Builder();
+                verifyingBuilder.add(new BooleanClause(approxQuery, Occur.MUST));
+                verifyingBuilder.add(new BooleanClause(verifyingQuery, Occur.MUST));
+                return verifyingBuilder.build();
+            } else if (numWildcardChars == 0 || numWildcardStrings > 0) {
+                // We have no concrete characters and we're not a pure length query e.g. ???
+                return new DocValuesFieldExistsQuery(name());
+            }
+            return verifyingQuery;
+        }

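To make the ngram decomposition concrete: the sketch below re-implements the documented "every other 3-gram, plus the trailing one" rule as standalone code (the real implementation is analyzer-based, works on codepoints, and caps clauses at MAX_CLAUSES_IN_APPROXIMATION_QUERY). It reproduces the acceleration terms the tests later in this diff expect, e.g. `foobar*` becomes `_fo oob bar`, where `_` stands for the TOKEN_START_OR_END_CHAR anchor:

```java
import java.util.ArrayList;
import java.util.List;

class NgramSketch {
    // Emit every other 3-gram of a fragment, plus the final 3-gram if it
    // would otherwise be skipped - mirroring getNgramTokens() in the diff.
    static List<String> grams(String fragment) {
        List<String> tokens = new ArrayList<>();
        String last = fragment; // fragments shorter than 3 chars are used whole
        boolean take = true;
        for (int i = 0; i + 3 <= fragment.length(); i++) {
            String gram = fragment.substring(i, i + 3);
            if (take) { tokens.add(gram); last = null; } else { last = gram; }
            take = !take;
        }
        if (last != null) tokens.add(last); // trailing gram completes coverage
        return tokens;
    }

    public static void main(String[] args) {
        // "_" marks the start anchor prepended to the literal prefix of "foobar*"
        System.out.println(grams("_foobar"));  // [_fo, oob, bar]
        // for "*foobar" the fragment carries the double end anchor instead
        System.out.println(grams("foobar__")); // [foo, oba, ar_, r__]
    }
}
```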
+        @Override
+        public Query regexpQuery(String value, int flags, int maxDeterminizedStates, RewriteMethod method, QueryShardContext context) {
+            if (value.length() == 0) {
+                return new MatchNoDocsQuery();
+            }
+
+            if (context.allowExpensiveQueries() == false) {
+                throw new ElasticsearchException(
+                    "[regexp] queries cannot be executed when '" + ALLOW_EXPENSIVE_QUERIES.getKey() + "' is set to false."
+                );
+            }
+
+            RegExp ngramRegex = new RegExp(addLineEndChars(toLowerCase(value)), flags);
+
+            Query approxBooleanQuery = toApproximationQuery(ngramRegex);
+            Query approxNgramQuery = rewriteBoolToNgramQuery(approxBooleanQuery);
+
+            // MatchAll is a special case meaning the regex is known to match everything .* and
+            // there is no need for verification.
+            if (approxNgramQuery instanceof MatchAllDocsQuery) {
+                return existsQuery(context);
+            }
+            Supplier<Automaton> deferredAutomatonSupplier = () -> {
+                RegExp regex = new RegExp(value, flags);
+                return regex.toAutomaton(maxDeterminizedStates);
+            };
+
+            AutomatonQueryOnBinaryDv verifyingQuery = new AutomatonQueryOnBinaryDv(name(), value, deferredAutomatonSupplier);
+
+            // MatchAllButRequireVerificationQuery is a special case meaning the regex is reduced to a single
+            // clause which we can't accelerate at all and needs verification. Example would be ".."
+            if (approxNgramQuery instanceof MatchAllButRequireVerificationQuery) {
+                return verifyingQuery;
+            }
+
+            // We can accelerate execution with the ngram query
+            BooleanQuery.Builder verifyingBuilder = new BooleanQuery.Builder();
+            verifyingBuilder.add(new BooleanClause(approxNgramQuery, Occur.MUST));
+            verifyingBuilder.add(new BooleanClause(verifyingQuery, Occur.MUST));
+            return verifyingBuilder.build();
+        }

+        // Convert a regular expression to a simplified query consisting of BooleanQuery and TermQuery objects
+        // which captures as much of the logic as possible. Query can produce some false positives but shouldn't
+        // produce any false negatives.
+        // In addition to Term and BooleanQuery clauses there are MatchAllDocsQuery objects (e.g for .*) and
+        // a RegExpQuery if we can't resolve to any of the above.
+        // * If an expression resolves to a single MatchAllDocsQuery eg .* then a match all shortcut is possible with
+        //   no verification needed.
+        // * If an expression resolves to a RegExpQuery eg ?? then only the verification
+        //   query is run.
+        // * Anything else is a concrete query that should be run on the ngram index.
+        public static Query toApproximationQuery(RegExp r) throws IllegalArgumentException {
+            Query result = null;
+            switch (r.kind) {
+                case REGEXP_UNION:
+                    result = createUnionQuery(r);
+                    break;
+                case REGEXP_CONCATENATION:
+                    result = createConcatenationQuery(r);
+                    break;
+                case REGEXP_STRING:
+                    String normalizedString = toLowerCase(r.s);
+                    result = new TermQuery(new Term("", normalizedString));
+                    break;
+                case REGEXP_CHAR:
+                    String cs = new StringBuilder().appendCodePoint(r.c).toString();
+                    String normalizedChar = toLowerCase(cs);
+                    result = new TermQuery(new Term("", normalizedChar));
+                    break;
+                case REGEXP_REPEAT:
+                    // Repeat is zero or more times so zero matches = match all
+                    result = new MatchAllDocsQuery();
+                    break;
+
+                case REGEXP_REPEAT_MIN:
+                case REGEXP_REPEAT_MINMAX:
+                    if (r.min > 0) {
+                        result = toApproximationQuery(r.exp1);
+                        if (result instanceof TermQuery) {
+                            // Wrap the repeating expression so that it is not concatenated by a parent which concatenates
+                            // plain TermQuery objects together. Boolean queries are interpreted as a black box and not
+                            // concatenated.
+                            BooleanQuery.Builder wrapper = new BooleanQuery.Builder();
+                            wrapper.add(result, Occur.MUST);
+                            result = wrapper.build();
+                        }
+                    } else {
+                        // Expressions like (a){0,3} match empty string or up to 3 a's.
+                        result = new MatchAllButRequireVerificationQuery();
+                    }
+                    break;
+                case REGEXP_ANYSTRING:
+                    // optimisation for .* queries - match all and no verification stage required.
+                    result = new MatchAllDocsQuery();
+                    break;
+                // All other kinds of expression cannot be represented as a boolean or term query so return an object
+                // that indicates verification is required
+                case REGEXP_OPTIONAL:
+                case REGEXP_INTERSECTION:
+                case REGEXP_COMPLEMENT:
+                case REGEXP_CHAR_RANGE:
+                case REGEXP_ANYCHAR:
+                case REGEXP_INTERVAL:
+                case REGEXP_EMPTY:
+                case REGEXP_AUTOMATON:
+                    result = new MatchAllButRequireVerificationQuery();
+                    break;
+            }
+            assert result != null; // All regex types are understood and translated to a query.
+            return result;
+        }

-                } else {
-                    // Break fragment into multiple Ngrams
-                    TokenStream tokenizer = WILDCARD_ANALYZER.tokenStream(name(), fragment);
-                    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
-                    String lastUnusedToken = null;
-                    try {
-                        tokenizer.reset();
-                        boolean takeThis = true;
-                        // minimise number of terms searched - eg for "12345" and 3grams we only need terms
-                        // `123` and `345` - no need to search for 234. We take every other ngram.
-                        while (tokenizer.incrementToken()) {
-                            String tokenValue = termAtt.toString();
-                            if (takeThis) {
-                                tokens.add(tokenValue);
-                            } else {
-                                lastUnusedToken = tokenValue;
+        private static Query createConcatenationQuery(RegExp r) {
+            // Create ANDs of expressions plus collapse consecutive TermQuerys into single longer ones
+            ArrayList<Query> queries = new ArrayList<>();
+            findLeaves(r.exp1, Kind.REGEXP_CONCATENATION, queries);
+            findLeaves(r.exp2, Kind.REGEXP_CONCATENATION, queries);
+            BooleanQuery.Builder bAnd = new BooleanQuery.Builder();
+            StringBuilder sequence = new StringBuilder();
+            for (Query query : queries) {
+                if (query instanceof TermQuery) {
+                    TermQuery tq = (TermQuery) query;
+                    sequence.append(tq.getTerm().text());
+                } else {
+                    if (sequence.length() > 0) {
+                        bAnd.add(new TermQuery(new Term("", sequence.toString())), Occur.MUST);
+                        sequence = new StringBuilder();
+                    }
+                    bAnd.add(query, Occur.MUST);
+                }
+            }
+            if (sequence.length() > 0) {
+                bAnd.add(new TermQuery(new Term("", sequence.toString())), Occur.MUST);
+            }
+            BooleanQuery combined = bAnd.build();
+            if (combined.clauses().size() > 0) {
+                return combined;
+            }
+            // There's something in the regex we couldn't represent as a query - resort to a match all with verification
+            return new MatchAllButRequireVerificationQuery();
+        }

+        private static Query createUnionQuery(RegExp r) {
+            // Create an OR of clauses
+            ArrayList<Query> queries = new ArrayList<>();
+            findLeaves(r.exp1, Kind.REGEXP_UNION, queries);
+            findLeaves(r.exp2, Kind.REGEXP_UNION, queries);
+            BooleanQuery.Builder bOr = new BooleanQuery.Builder();
+            HashSet<Query> uniqueClauses = new HashSet<>();
+            for (Query query : queries) {
+                if (uniqueClauses.add(query)) {
+                    bOr.add(query, Occur.SHOULD);
+                }
+            }
+            if (uniqueClauses.size() > 0) {
+                if (uniqueClauses.size() == 1) {
+                    // Fully-understood ORs that collapse to a single term should be returned minus
+                    // the BooleanQuery wrapper so that they might be concatenated.
+                    // Helps turn [Pp][Oo][Ww][Ee][Rr][Ss][Hh][Ee][Ll][Ll] into "powershell"
+                    // Each char pair eg (P OR p) can be normalized to (p) which can be a single term
+                    return uniqueClauses.iterator().next();
+                } else {
+                    return bOr.build();
+                }
+            }
+            // There's something in the regex we couldn't represent as a query - resort to a match all with verification
+            return new MatchAllButRequireVerificationQuery();
+        }

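The union-collapse comment above is easiest to see with a worked example. This standalone sketch (not the PR's code, which works on Query objects) shows the normalisation idea: an OR of case variants like (P|p) dedupes to one lowercased character, so a character-class chain concatenates into a single searchable term instead of many unaccelerable single-char clauses:

```java
import java.util.LinkedHashSet;
import java.util.Set;

class UnionCollapseSketch {
    // For each position, dedupe the lowercased alternatives; a single
    // survivor can be concatenated into one term, as described above.
    static String collapse(String[][] charAlternatives) {
        StringBuilder term = new StringBuilder();
        for (String[] alternatives : charAlternatives) {
            Set<String> unique = new LinkedHashSet<>();
            for (String alt : alternatives) {
                unique.add(alt.toLowerCase()); // the mapper lowercases ngrams
            }
            if (unique.size() == 1) {
                term.append(unique.iterator().next());
            }
        }
        return term.toString();
    }

    public static void main(String[] args) {
        String[][] powershell = { {"P","p"}, {"O","o"}, {"W","w"}, {"E","e"}, {"R","r"},
                                  {"S","s"}, {"H","h"}, {"E","e"}, {"L","l"}, {"L","l"} };
        System.out.println(collapse(powershell)); // powershell
    }
}
```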
-                            }
-                            // alternate
-                            takeThis = !takeThis;
-                        }
-                        if (lastUnusedToken != null) {
-                            // given `cake` and 3 grams the loop above would output only `cak` and we need to add trailing
-                            // `ake` to complete the logic.
-                            tokens.add(lastUnusedToken);
-                        }
+        private static void findLeaves(RegExp exp, Kind kind, List<Query> queries) {
+            if (exp.kind == kind) {
+                findLeaves(exp.exp1, kind, queries);
+                findLeaves(exp.exp2, kind, queries);
+            } else {
+                queries.add(toApproximationQuery(exp));
+            }
+        }
+
+        private static String toLowerCase(String string) {
+            return lowercaseNormalizer.normalize(null, string).utf8ToString();
+        }
+
+        // Takes a BooleanQuery + TermQuery tree representing query logic and rewrites using ngrams of appropriate size.
+        private Query rewriteBoolToNgramQuery(Query approxQuery) {
+            //TODO optimise more intelligently so we:
+            // 1) favour full-length term queries eg abc over short eg a* when pruning too many clauses.
+            // 2) make MAX_CLAUSES_IN_APPROXIMATION_QUERY a global cap rather than per-boolean clause.
+            if (approxQuery == null) {
+                return null;
+            }
+            if (approxQuery instanceof BooleanQuery) {
+                BooleanQuery bq = (BooleanQuery) approxQuery;
+                BooleanQuery.Builder rewritten = new BooleanQuery.Builder();
+                int clauseCount = 0;
+                for (BooleanClause clause : bq) {
+                    Query q = rewriteBoolToNgramQuery(clause.getQuery());
+                    if (q != null) {
+                        if (clause.getOccur().equals(Occur.MUST)) {
+                            // Can't drop "should" clauses because it can elevate a sibling optional item
+                            // to mandatory (shoulds with 1 clause) causing false negatives
+                            // Dropping MUSTs increase false positives which are OK because are verified anyway.
+                            clauseCount++;
+                            if (clauseCount >= MAX_CLAUSES_IN_APPROXIMATION_QUERY) {
+                                break;
+                            }
+                        }
+                        rewritten.add(q, clause.getOccur());
+                    }
+                }
+                return simplify(rewritten.build());
+            }
-                        tokenizer.end();
-                        tokenizer.close();
-                    } catch (IOException ioe) {
-                        throw new ElasticsearchParseException("Error parsing wildcard query pattern fragment [" + fragment + "]");
+            if (approxQuery instanceof TermQuery) {
+                TermQuery tq = (TermQuery) approxQuery;
+
+                //Remove simple terms that are only string beginnings or ends.
+                String s = tq.getTerm().text();
+                if (s.equals(WildcardFieldMapper.TOKEN_START_STRING) || s.equals(WildcardFieldMapper.TOKEN_END_STRING)) {
+                    return new MatchAllButRequireVerificationQuery();
+                }
+
+                // Break term into tokens
+                Set<String> tokens = new LinkedHashSet<>();
+                getNgramTokens(tokens, s);
+                BooleanQuery.Builder rewritten = new BooleanQuery.Builder();
+                for (String string : tokens) {
+                    addClause(string, rewritten, Occur.MUST);
+                }
+                return simplify(rewritten.build());
+            }
+            if (isMatchAll(approxQuery)) {
+                return approxQuery;
+            }
+            throw new IllegalStateException("Invalid query type found parsing regex query:" + approxQuery);
+        }

-                    }
-                }
-            }
-
-            if (patternStructure.isMatchAll()) {
-                return new MatchAllDocsQuery();
-            }
-            BooleanQuery approximation = createApproximationQuery(tokens);
-            if (approximation.clauses().size() > 1 || patternStructure.needsVerification()) {
-                BooleanQuery.Builder verifyingBuilder = new BooleanQuery.Builder();
-                verifyingBuilder.add(new BooleanClause(approximation, Occur.MUST));
-                Automaton automaton = WildcardQuery.toAutomaton(new Term(name(), wildcardPattern));
-                verifyingBuilder.add(new BooleanClause(new AutomatonQueryOnBinaryDv(name(), wildcardPattern, automaton), Occur.MUST));
-                return verifyingBuilder.build();
-            }
-            return approximation;
-        }
-
-        private BooleanQuery createApproximationQuery(ArrayList<String> tokens) {
-            BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
-            if (tokens.size() <= MAX_CLAUSES_IN_APPROXIMATION_QUERY) {
-                for (String token : tokens) {
-                    addClause(token, bqBuilder);
-                }
-                return bqBuilder.build();
-            }
-            // Thin out the number of clauses using a selection spread evenly across the range
-            float step = (float) (tokens.size() - 1) / (float) (MAX_CLAUSES_IN_APPROXIMATION_QUERY - 1); // set step size
-            for (int i = 0; i < MAX_CLAUSES_IN_APPROXIMATION_QUERY; i++) {
-                addClause(tokens.get(Math.round(step * i)), bqBuilder); // add each element of a position which is a multiple of step
-            }
-            // TODO we can be smarter about pruning here. e.g.
-            // * Avoid wildcard queries if there are sufficient numbers of other terms that are full 3grams that are cheaper term queries
-            // * We can select terms on their scarcity rather than even spreads across the search string.
-            return bqBuilder.build();
-        }
+        static Query simplify(Query input) {
+            if (input instanceof BooleanQuery == false) {
+                return input;
+            }
+            BooleanQuery result = (BooleanQuery) input;
+            if (result.clauses().size() == 0) {
+                // A ".*" clause can produce zero clauses in which case we return MatchAll
+                return new MatchAllDocsQuery();
+            }
+            if (result.clauses().size() == 1) {
+                return simplify(result.clauses().get(0).getQuery());
+            }
+
+            // We may have a mix of MatchAll and concrete queries - assess if we can simplify
+            int matchAllCount = 0;
+            int verifyCount = 0;
+            boolean allConcretesAreOptional = true;
+            for (BooleanClause booleanClause : result.clauses()) {
+                Query q = booleanClause.getQuery();
+                if (q instanceof MatchAllDocsQuery) {
+                    matchAllCount++;
+                } else if (q instanceof MatchAllButRequireVerificationQuery) {
+                    verifyCount++;
+                } else {
+                    // Concrete query
+                    if (booleanClause.getOccur() != Occur.SHOULD) {
+                        allConcretesAreOptional = false;
+                    }
+                }
+            }
+
+            if ((allConcretesAreOptional && matchAllCount > 0)) {
+                // Any match all expression takes precedence over all optional concrete queries.
+                return new MatchAllDocsQuery();
+            }
+
+            if ((allConcretesAreOptional && verifyCount > 0)) {
+                // Any match all expression that needs verification takes precedence over all optional concrete queries.
+                return new MatchAllButRequireVerificationQuery();
+            }
+
+            // We have some mandatory concrete queries - strip out the superfluous match all expressions
+            if (allConcretesAreOptional == false && matchAllCount + verifyCount > 0) {
+                BooleanQuery.Builder rewritten = new BooleanQuery.Builder();
+                for (BooleanClause booleanClause : result.clauses()) {
+                    if (isMatchAll(booleanClause.getQuery()) == false) {
+                        rewritten.add(booleanClause);
+                    }
+                }
+                return simplify(rewritten.build());
+            }
+            return result;
+        }
+
+        static boolean isMatchAll(Query q) {
+            return q instanceof MatchAllDocsQuery || q instanceof MatchAllButRequireVerificationQuery;
+        }

-        private void addClause(String token, BooleanQuery.Builder bqBuilder) {
-            assert token.codePointCount(0, token.length()) <= NGRAM_SIZE;
-            if (token.codePointCount(0, token.length()) == NGRAM_SIZE) {
-                TermQuery tq = new TermQuery(new Term(name(), token));
-                bqBuilder.add(new BooleanClause(tq, Occur.MUST));
-            } else {
-                WildcardQuery wq = new WildcardQuery(new Term(name(), token + "*"));
-                wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE);
-                bqBuilder.add(new BooleanClause(wq, Occur.MUST));
-            }
-        }
+        protected void getNgramTokens(Set<String> tokens, String fragment) {
+            if (fragment.equals(TOKEN_START_STRING) || fragment.equals(TOKEN_END_STRING)) {
+                // If a regex is a form of match-all e.g. ".*" we only produce the token start/end markers as search
+                // terms which can be ignored.
+                return;
+            }
+            // Break fragment into multiple Ngrams
+            TokenStream tokenizer = WILDCARD_ANALYZER.tokenStream(name(), fragment);
+            CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
+            // If fragment length < NGRAM_SIZE then it is not emitted by token stream so need
+            // to initialise with the value here
+            String lastUnusedToken = fragment;
+            try {
+                tokenizer.reset();
+                boolean takeThis = true;
+                // minimise number of terms searched - eg for "12345" and 3grams we only need terms
+                // `123` and `345` - no need to search for 234. We take every other ngram.
+                while (tokenizer.incrementToken()) {
+                    String tokenValue = termAtt.toString();
+                    if (takeThis) {
+                        tokens.add(tokenValue);
+                        lastUnusedToken = null;
+                    } else {
+                        lastUnusedToken = tokenValue;
+                    }
+                    // alternate
+                    takeThis = !takeThis;
+                    if (tokens.size() >= MAX_CLAUSES_IN_APPROXIMATION_QUERY) {
+                        lastUnusedToken = null;
+                        break;
+                    }
+                }
+                if (lastUnusedToken != null) {
+                    // given `cake` and 3 grams the loop above would output only `cak` and we need to add trailing
+                    // `ake` to complete the logic.
+                    tokens.add(lastUnusedToken);
+                }
+                tokenizer.end();
+                tokenizer.close();
+            } catch (IOException ioe) {
+                throw new ElasticsearchParseException("Error parsing wildcard regex pattern fragment [" + fragment + "]");
+            }
+        }

+        private void addClause(String token, BooleanQuery.Builder bqBuilder, Occur occur) {
+            assert token.codePointCount(0, token.length()) <= NGRAM_SIZE;
+            int tokenSize = token.codePointCount(0, token.length());
+            if (tokenSize < 2 || token.equals(WildcardFieldMapper.TOKEN_END_STRING)) {
+                // there's something concrete to be searched but it's too short
+                // Require verification.
+                bqBuilder.add(new BooleanClause(new MatchAllButRequireVerificationQuery(), occur));
+                return;
+            }
+            if (tokenSize == NGRAM_SIZE) {
+                TermQuery tq = new TermQuery(new Term(name(), token));
+                bqBuilder.add(new BooleanClause(tq, occur));
+            } else {
+                PrefixQuery wq = new PrefixQuery(new Term(name(), token));
+                wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE);
+                bqBuilder.add(new BooleanClause(wq, occur));
+            }
+        }
+
+        @Override
+        public Query fuzzyQuery(
+            Object value,
+            Fuzziness fuzziness,
+            int prefixLength,
+            int maxExpansions,
+            boolean transpositions,
+            QueryShardContext context
+        ) {
+            String searchTerm = BytesRefs.toString(value);
+            String lowerSearchTerm = toLowerCase(searchTerm);
+            try {
+                BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
+                //The approximation query can have a prefix and any number of ngrams.
+                BooleanQuery.Builder approxBuilder = new BooleanQuery.Builder();
+
+                String postPrefixString = lowerSearchTerm;
+
+                // Add all content prior to prefixLength as a MUST clause to the ngram index query
+                if (prefixLength > 0) {
+                    Set<String> prefixTokens = new LinkedHashSet<>();
+                    postPrefixString = lowerSearchTerm.substring(prefixLength);
+                    String prefixCandidate = TOKEN_START_OR_END_CHAR + lowerSearchTerm.substring(0, prefixLength);
+                    getNgramTokens(prefixTokens, prefixCandidate);
+                    for (String prefixToken : prefixTokens) {
+                        addClause(prefixToken, approxBuilder, Occur.MUST);
+                    }
+                }
+                // Tokenize all content after the prefix
+                TokenStream tokenizer = WILDCARD_ANALYZER.tokenStream(name(), postPrefixString);
+                CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
+                ArrayList<String> postPrefixTokens = new ArrayList<>();
+                String firstToken = null;
+                tokenizer.reset();
+                int tokenNumber = 0;
+                while (tokenizer.incrementToken()) {
+                    if (tokenNumber == 0) {
+                        String token = termAtt.toString();
+                        if (firstToken == null) {
+                            firstToken = token;
+                        }
+                        postPrefixTokens.add(token);
+                    }
+                    // Take every 3rd ngram so they are all disjoint. Our calculation for min_should_match
+                    // number relies on there being no overlaps
+                    tokenNumber++;
+                    if (tokenNumber == 3) {
+                        tokenNumber = 0;
+                    }
+                }
+                tokenizer.end();
+                tokenizer.close();
+
+                BooleanQuery.Builder ngramBuilder = new BooleanQuery.Builder();
+                int numClauses = 0;
+                for (String token : postPrefixTokens) {
+                    addClause(token, ngramBuilder, Occur.SHOULD);
+                    numClauses++;
+                }
+
+                // Approximation query
+                if (numClauses > fuzziness.asDistance(searchTerm)) {
+                    // Useful accelerant - set min should match based on number of permitted edits.
+                    ngramBuilder.setMinimumNumberShouldMatch(numClauses - fuzziness.asDistance(searchTerm));
+                    approxBuilder.add(ngramBuilder.build(), Occur.MUST);
+                }
+
+                BooleanQuery ngramQ = approxBuilder.build();
+                if (ngramQ.clauses().size() > 0) {
+                    bqBuilder.add(ngramQ, Occur.MUST);
+                }
+
+                Supplier<Automaton> deferredAutomatonSupplier = () -> {
+                    // Verification query
+                    FuzzyQuery fq = new FuzzyQuery(
+                        new Term(name(), searchTerm),
+                        fuzziness.asDistance(searchTerm),
+                        prefixLength,
+                        maxExpansions,
+                        transpositions
+                    );
+                    return fq.getAutomata().automaton;
+                };
+                bqBuilder.add(new AutomatonQueryOnBinaryDv(name(), searchTerm, deferredAutomatonSupplier), Occur.MUST);
+
+                return bqBuilder.build();
+            } catch (IOException ioe) {
+                throw new ElasticsearchParseException("Error parsing wildcard field fuzzy string [" + searchTerm + "]");
+            }
+        }

     }

     @Override
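The min_should_match trick in fuzzyQuery above relies on the selected 3-grams being disjoint: a single character edit can corrupt at most one non-overlapping 3-gram, so with n disjoint grams and e permitted edits, at least n - e grams must still match any true candidate. A standalone sketch of that arithmetic (not the PR's code):

```java
class FuzzyMinShouldMatchSketch {
    // With disjoint ngrams, one edit damages at most one ngram, so a
    // candidate within 'edits' distance must still match this many clauses.
    static int minShouldMatch(int numDisjointNgrams, int edits) {
        return Math.max(0, numDisjointNgrams - edits);
    }

    public static void main(String[] args) {
        // e.g. a 9-char term yields 3 disjoint 3-grams; with 1 permitted edit,
        // at least 2 of them must match for a doc to be a candidate.
        System.out.println(minShouldMatch(3, 1)); // 2
    }
}
```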
@@ -568,7 +900,10 @@ public class WildcardFieldMapper extends FieldMapper {
         if (value == null || value.length() > ignoreAbove) {
             return;
         }
-        String ngramValue = TOKEN_START_OR_END_CHAR + value + TOKEN_START_OR_END_CHAR + TOKEN_START_OR_END_CHAR;
+        // Always lower case the ngram index and value - helps with
+        // a) speed (less ngram variations to explore on disk and in RAM-based automaton) and
+        // b) uses less disk space
+        String ngramValue = addLineEndChars(WildcardFieldType.toLowerCase(value));
         Field ngramField = new Field(fieldType().name(), ngramValue, ngramFieldType);
         fields.add(ngramField);
@@ -580,6 +915,11 @@ public class WildcardFieldMapper extends FieldMapper {
             dvField.add(value.getBytes(StandardCharsets.UTF_8));
         }
     }

+    // Values held in the ngram index are encoded with special characters to denote start and end of values.
+    static String addLineEndChars(String value) {
+        return TOKEN_START_OR_END_CHAR + value + TOKEN_START_OR_END_CHAR + TOKEN_START_OR_END_CHAR;
+    }
+
     @Override
     protected String contentType() {
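To see what the indexing path does to a stored value: each value is lowercased and then wrapped with one start marker and two end markers (char 0) before ngramming, so anchored prefixes and suffixes become ordinary ngram terms. A standalone sketch, using '_' in place of the unprintable marker (the real code does the two steps separately via WildcardFieldType.toLowerCase and addLineEndChars):

```java
class AnchorSketch {
    static final char MARKER = '_'; // stands in for TOKEN_START_OR_END_CHAR (char 0)

    // One start marker, two end markers, after lowercasing.
    static String encode(String value) {
        return MARKER + value.toLowerCase() + MARKER + MARKER;
    }

    public static void main(String[] args) {
        System.out.println(encode("FooBar")); // _foobar__
        // 3-grams now include "_fo" (anchored prefix) and "r__" (anchored suffix),
        // which is how queries like "foo*" and "*bar" become plain term lookups.
    }
}
```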
@@ -6,6 +6,7 @@

 package org.elasticsearch.xpack.wildcard.mapper;

+import org.apache.lucene.analysis.core.KeywordAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.SortedSetDocValuesField;
@@ -15,20 +16,31 @@ import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.classic.QueryParser;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.DocValuesFieldExistsQuery;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.MatchNoDocsQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.Sort;
 import org.apache.lucene.search.SortField;
+import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.WildcardQuery;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.ByteRunAutomaton;
+import org.apache.lucene.util.automaton.RegExp;
 import org.elasticsearch.Version;
 import org.elasticsearch.cluster.metadata.IndexMetadata;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.unit.Fuzziness;
 import org.elasticsearch.common.util.BigArrays;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.IndexSettings;
@@ -55,12 +67,22 @@ import java.util.HashSet;
 import java.util.function.BiFunction;

 import static org.hamcrest.Matchers.equalTo;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;

 public class WildcardFieldMapperTests extends ESTestCase {

+    static QueryShardContext createMockQueryShardContext(boolean allowExpensiveQueries) {
+        QueryShardContext queryShardContext = mock(QueryShardContext.class);
+        when(queryShardContext.allowExpensiveQueries()).thenReturn(allowExpensiveQueries);
+        return queryShardContext;
+    }
+
     private static final String KEYWORD_FIELD_NAME = "keyword_field";
     private static final String WILDCARD_FIELD_NAME = "wildcard_field";
-    static final int MAX_FIELD_LENGTH = 100;
+    public static final QueryShardContext MOCK_QSC = createMockQueryShardContext(true);
+
+    static final int MAX_FIELD_LENGTH = 30;
     static WildcardFieldMapper wildcardFieldType;
     static KeywordFieldMapper keywordFieldType;
@@ -136,11 +158,18 @@ public class WildcardFieldMapperTests extends ESTestCase {
         IndexSearcher searcher = newSearcher(reader);
         iw.close();

+        // Test wildcard query
         String queryString = randomABString((BooleanQuery.getMaxClauseCount() * 2) + 1);
         Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(queryString, null, null);
         TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, 10, Sort.INDEXORDER);
         assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(0L));

+        // Test regexp query
+        wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(queryString, RegExp.ALL, 20000, null, MOCK_QSC);
+        wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, 10, Sort.INDEXORDER);
+        assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(0L));
+
         reader.close();
         dir.close();
     }
@@ -181,15 +210,59 @@ public class WildcardFieldMapperTests extends ESTestCase {

         int numSearches = 100;
         for (int i = 0; i < numSearches; i++) {
-            String randomWildcardPattern = getRandomWildcardPattern();
-
-            Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(randomWildcardPattern, null, null);
-            TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, values.size() + 1, Sort.INDEXORDER);
-
-            Query keywordFieldQuery = new WildcardQuery(new Term(KEYWORD_FIELD_NAME, randomWildcardPattern));
-            TopDocs kwTopDocs = searcher.search(keywordFieldQuery, values.size() + 1, Sort.INDEXORDER);
-
-            assertThat(kwTopDocs.totalHits.value, equalTo(wildcardFieldTopDocs.totalHits.value));
+            Query wildcardFieldQuery = null;
+            Query keywordFieldQuery = null;
+            String pattern = null;
+            switch (randomInt(3)) {
+            case 0:
+                pattern = getRandomWildcardPattern();
+                wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(pattern, null, MOCK_QSC);
+                keywordFieldQuery = keywordFieldType.fieldType().wildcardQuery(pattern, null, MOCK_QSC);
+                break;
+            case 1:
+                pattern = getRandomRegexPattern(values);
+                wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(pattern, RegExp.ALL, 20000, null, MOCK_QSC);
+                keywordFieldQuery = keywordFieldType.fieldType().regexpQuery(pattern, RegExp.ALL, 20000, null, MOCK_QSC);
+                break;
+            case 2:
+                pattern = randomABString(5);
+                wildcardFieldQuery = wildcardFieldType.fieldType().prefixQuery(pattern, null, MOCK_QSC);
+                keywordFieldQuery = keywordFieldType.fieldType().prefixQuery(pattern, null, MOCK_QSC);
+                break;
+            case 3:
+                int edits = randomInt(2);
+                int prefixLength = randomInt(4);
+                pattern = getRandomFuzzyPattern(values, edits, prefixLength);
+                Fuzziness fuzziness = Fuzziness.AUTO;
+                switch (edits) {
+                    case 0:
+                        fuzziness = Fuzziness.ZERO;
+                        break;
+                    case 1:
+                        fuzziness = Fuzziness.ONE;
+                        break;
+                    case 2:
+                        fuzziness = Fuzziness.TWO;
+                        break;
+                    default:
+                        break;
+                }
+                // Prefix length shouldn't be longer than selected search string
+                // BUT keyword field has a bug with prefix length when equal - see https://github.com/elastic/elasticsearch/issues/55790
+                // so we opt for one less
+                prefixLength = Math.min(pattern.length() - 1, prefixLength);
+                boolean transpositions = randomBoolean();
+
+                wildcardFieldQuery = wildcardFieldType.fieldType().fuzzyQuery(pattern, fuzziness, prefixLength, 50,
+                    transpositions, MOCK_QSC);
+                keywordFieldQuery = keywordFieldType.fieldType().fuzzyQuery(pattern, fuzziness, prefixLength, 50,
+                    transpositions, MOCK_QSC);
+                break;
+            }
+            TopDocs kwTopDocs = searcher.search(keywordFieldQuery, values.size() + 1, Sort.RELEVANCE);
+            TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, values.size() + 1, Sort.RELEVANCE);
+            assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(kwTopDocs.totalHits.value));

             HashSet<Integer> expectedDocs = new HashSet<>();
             for (ScoreDoc topDoc : kwTopDocs.scoreDocs) {
@@ -201,7 +274,6 @@ public class WildcardFieldMapperTests extends ESTestCase {
             assertThat(expectedDocs.size(), equalTo(0));
         }

-
         //Test keyword and wildcard sort operations are also equivalent
         QueryShardContext shardContextMock = createMockShardContext();
@ -221,8 +293,334 @@ public class WildcardFieldMapperTests extends ESTestCase {
|
||||||
reader.close();
|
reader.close();
|
||||||
dir.close();
|
dir.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testRegexAcceleration() throws IOException, ParseException {
|
||||||
|
// All these expressions should rewrite to a match all with no verification step required at all
|
||||||
|
String superfastRegexes[]= { ".*", "...*..", "(foo|bar|.*)", "@"};
|
||||||
|
for (String regex : superfastRegexes) {
|
||||||
|
Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp.ALL, 20000, null, MOCK_QSC);
|
||||||
|
assertTrue(wildcardFieldQuery instanceof DocValuesFieldExistsQuery);
|
||||||
|
}
|
||||||
|
String matchNoDocsRegexes[]= { ""};
|
||||||
|
for (String regex : matchNoDocsRegexes) {
|
||||||
|
Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp.ALL, 20000, null, MOCK_QSC);
|
||||||
|
assertTrue(wildcardFieldQuery instanceof MatchNoDocsQuery);
|
||||||
|
}
|
||||||
|
|
||||||
|
// All of these regexes should be accelerated as the equivalent of the given QueryString query
|
||||||
|
String acceleratedTests[][] = {
|
||||||
|
{".*foo.*", "foo"},
|
||||||
|
{"..foobar","+foo +oba +ar_ +r__"},
|
||||||
|
{"(maynotexist)?foobar","+foo +oba +ar_ +r__"},
|
||||||
|
{".*/etc/passw.*", "+\\/et +tc\\/ +\\/pa +ass +ssw"},
|
||||||
|
{".*etc/passwd", "+etc +c\\/p +pas +ssw +wd_ +d__"},
|
||||||
|
{"(http|ftp)://foo.*", "+((+htt +ttp) ftp) +(+\\:\\/\\/ +\\/fo +foo)"},
|
||||||
|
{"[Pp][Oo][Ww][Ee][Rr][Ss][Hh][Ee][Ll][Ll]\\.[Ee][Xx][Ee]", "+_po +owe +ers +she +ell +l\\.e +exe +e__"},
|
||||||
|
{"foo<1-100>bar", "+(+_fo +foo) +(+bar +r__ )"},
|
||||||
|
{"(aaa.+&.+bbb)cat", "+cat +t__"},
|
||||||
|
{".a", "a__"}
|
||||||
|
};
|
||||||
|
for (String[] test : acceleratedTests) {
|
||||||
|
String regex = test[0];
|
||||||
|
String expectedAccelerationQueryString = test[1].replaceAll("_", ""+WildcardFieldMapper.TOKEN_START_OR_END_CHAR);
|
||||||
|
Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp.ALL, 20000, null, MOCK_QSC);
|
||||||
|
testExpectedAccelerationQuery(regex, wildcardFieldQuery, expectedAccelerationQueryString);
|
||||||
|
}

        // All these expressions should rewrite to just the verification query (there's no ngram acceleration)
        // TODO we can possibly improve on some of these
        String matchAllButVerifyTests[] = { "..", "(a)?", "(a|b){0,3}", "((foo)?|(foo|bar)?)", "@&~(abc.+)", "aaa.+&.+bbb" };
        for (String regex : matchAllButVerifyTests) {
            Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp.ALL, 20000, null, MOCK_QSC);
            assertTrue(regex + " was not a pure verify query " + formatQuery(wildcardFieldQuery),
                wildcardFieldQuery instanceof AutomatonQueryOnBinaryDv);
        }
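        // Everything above is too short, optional, or negated to yield a mandatory ngram, so the
        // rewrite keeps only the doc-values automaton check.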

        // Documentation - regexes that do try to accelerate but that we would like to improve in future versions.
        String suboptimalTests[][] = {
            // TODO short wildcards like a* OR b* aren't great so we just drop them.
            // Ideally we would attach to successors to create (acd OR bcd)
            { "[ab]cd", "+cd_ +d__" }
        };
        for (String[] test : suboptimalTests) {
            String regex = test[0];
            String expectedAccelerationQueryString = test[1].replaceAll("_", "" + WildcardFieldMapper.TOKEN_START_OR_END_CHAR);
            Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp.ALL, 20000, null, MOCK_QSC);
            testExpectedAccelerationQuery(regex, wildcardFieldQuery, expectedAccelerationQueryString);
        }
    }

    // Make error messages more readable
    String formatQuery(Query q) {
        return q.toString().replaceAll(WILDCARD_FIELD_NAME + ":", "").replaceAll(WildcardFieldMapper.TOKEN_START_STRING, "_");
    }

    public void testWildcardAcceleration() throws IOException, ParseException {

        // All these expressions should rewrite to MatchAll with no verification step required at all
        String superfastPattern[] = { "*", "**", "*?" };
        for (String pattern : superfastPattern) {
            Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(pattern, null, MOCK_QSC);
            assertTrue(
                pattern + " was not a pure match all query " + formatQuery(wildcardFieldQuery),
                wildcardFieldQuery instanceof DocValuesFieldExistsQuery
            );
        }

        // All of these patterns should be accelerated.
        String tests[][] = {
            { "*foobar", "+foo +oba +ar_ +r__" },
            { "foobar*", "+_fo +oob +bar" },
            { "foo\\*bar*", "+_fo +oo\\* +\\*ba +bar" },
            { "foo\\?bar*", "+_fo +oo\\? +\\?ba +bar" },
            { "foo*bar", "+_fo +foo +bar +r__" },
            { "foo?bar", "+_fo +foo +bar +r__" },
            { "?foo*bar?", "+foo +bar" },
            { "*c", "+c__" } };
        for (String[] test : tests) {
            String pattern = test[0];
            String expectedAccelerationQueryString = test[1].replaceAll("_", "" + WildcardFieldMapper.TOKEN_START_OR_END_CHAR);
            Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(pattern, null, MOCK_QSC);
            testExpectedAccelerationQuery(pattern, wildcardFieldQuery, expectedAccelerationQueryString);
            assertTrue(wildcardFieldQuery instanceof BooleanQuery);
        }

        // TODO All these expressions have no acceleration at all and could be improved
        String slowPatterns[] = { "??" };
        for (String pattern : slowPatterns) {
            Query wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(pattern, null, MOCK_QSC);
            assertTrue(
                pattern + " was not as slow as we assumed " + formatQuery(wildcardFieldQuery),
                wildcardFieldQuery instanceof AutomatonQueryOnBinaryDv
            );
        }
    }
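
    /**
     * Expectation holder for fuzzy-query acceleration: the expected approximation query is an
     * ngram prefix query (built from the protected prefix) ANDed with a set of ngram SHOULD
     * clauses governed by expectedMinShouldMatch. "_" placeholders in the expectation strings
     * expand to WildcardFieldMapper.TOKEN_START_STRING.
     */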
    static class FuzzyTest {
        String pattern;
        int prefixLength;
        Fuzziness fuzziness;
        String expectedPrefixQuery;
        int expectedMinShouldMatch;
        String ngrams;

        FuzzyTest(
            String pattern,
            int prefixLength,
            Fuzziness fuzziness,
            String expectedPrefixQuery,
            int expectedMinShouldMatch,
            String ngrams
        ) {
            super();
            this.pattern = pattern;
            this.prefixLength = prefixLength;
            this.fuzziness = fuzziness;
            this.expectedPrefixQuery = expectedPrefixQuery;
            this.expectedMinShouldMatch = expectedMinShouldMatch;
            this.ngrams = ngrams;
        }

        Query getFuzzyQuery() {
            return wildcardFieldType.fieldType().fuzzyQuery(pattern, fuzziness, prefixLength, 50, true, MOCK_QSC);
        }

        Query getExpectedApproxQuery() throws ParseException {
            BooleanQuery.Builder bq = new BooleanQuery.Builder();
            if (expectedPrefixQuery != null) {
                String[] tokens = expectedPrefixQuery.split(" ");
                Query prefixQuery = null;
                if (tokens.length == 1) {
                    prefixQuery = new TermQuery(
                        new Term(WILDCARD_FIELD_NAME, tokens[0].replaceAll("_", WildcardFieldMapper.TOKEN_START_STRING))
                    );
                } else {
                    BooleanQuery.Builder pqb = new BooleanQuery.Builder();
                    for (String token : tokens) {
                        Query ngramQuery = new TermQuery(
                            new Term(WILDCARD_FIELD_NAME, token.replaceAll("_", WildcardFieldMapper.TOKEN_START_STRING))
                        );
                        pqb.add(ngramQuery, Occur.MUST);
                    }
                    prefixQuery = pqb.build();
                }

                if (ngrams == null) {
                    return prefixQuery;
                }
                bq.add(prefixQuery, Occur.MUST);
            }

            if (ngrams != null) {
                BooleanQuery.Builder nq = new BooleanQuery.Builder();
                String[] tokens = ngrams.split(" ");
                for (String token : tokens) {
                    Query ngramQuery = new TermQuery(
                        new Term(WILDCARD_FIELD_NAME, token.replaceAll("_", WildcardFieldMapper.TOKEN_START_STRING))
                    );
                    nq.add(ngramQuery, Occur.SHOULD);
                }
                nq.setMinimumNumberShouldMatch(expectedMinShouldMatch);
                bq.add(nq.build(), Occur.MUST);
            }
            return bq.build();
        }
    }
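
    // Example reading of a test row: for "1234567890" with prefixLength 2, the protected prefix
    // yields the anchored ngram "_12" and at least one of the disjoint ngrams "345"/"678" must
    // match - Fuzziness.ONE permits a single edit, which can corrupt at most one of them.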
    public void testFuzzyAcceleration() throws IOException, ParseException {

        FuzzyTest[] tests = {
            new FuzzyTest("123456", 0, Fuzziness.ONE, null, 1, "123 456"),
            new FuzzyTest("1234567890", 2, Fuzziness.ONE, "_12", 1, "345 678"),
            new FuzzyTest("12345678901", 2, Fuzziness.ONE, "_12", 2, "345 678 901"),
            new FuzzyTest("12345678", 4, Fuzziness.ONE, "_12 234", 0, null)
        };
        for (FuzzyTest test : tests) {
            Query wildcardFieldQuery = test.getFuzzyQuery();
            testExpectedAccelerationQuery(test.pattern, wildcardFieldQuery, test.getExpectedApproxQuery());
        }
    }

    void testExpectedAccelerationQuery(String regex, Query combinedQuery, String expectedAccelerationQueryString) throws ParseException {
        QueryParser qsp = new QueryParser(WILDCARD_FIELD_NAME, new KeywordAnalyzer());
        Query expectedAccelerationQuery = qsp.parse(expectedAccelerationQueryString);
        testExpectedAccelerationQuery(regex, combinedQuery, expectedAccelerationQuery);
    }
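
    // A rewritten wildcard-field query is expected to hold exactly two clauses: the cheap ngram
    // approximation plus the AutomatonQueryOnBinaryDv verification. Peel off the approximation
    // and compare it with what the test expects.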
    void testExpectedAccelerationQuery(String regex, Query combinedQuery, Query expectedAccelerationQuery) throws ParseException {
        BooleanQuery cq = (BooleanQuery) combinedQuery;
        assert cq.clauses().size() == 2;
        Query approximationQuery = null;
        boolean verifyQueryFound = false;
        for (BooleanClause booleanClause : cq.clauses()) {
            Query q = booleanClause.getQuery();
            if (q instanceof AutomatonQueryOnBinaryDv) {
                verifyQueryFound = true;
            } else {
                approximationQuery = q;
            }
        }
        assert verifyQueryFound;

        String message = "regex: " + regex + "\nactual query: " + formatQuery(approximationQuery) +
            "\nexpected query: " + formatQuery(expectedAccelerationQuery) + "\n";
        assertEquals(message, expectedAccelerationQuery, approximationQuery);
    }
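
    // Builds a pattern within the requested number of edits of a randomly chosen indexed value:
    // the first edit inserts a "C" just past the protected prefix (the random test values are
    // assumed to use only the a/b alphabet, so "C" is guaranteed to count as an edit) and a
    // second edit, if requested, appends a character.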
    private String getRandomFuzzyPattern(HashSet<String> values, int edits, int prefixLength) {
        assert edits >= 0 && edits <= 2;
        // Pick one of the indexed document values to focus our queries on.
        String randomValue = values.toArray(new String[0])[randomIntBetween(0, values.size() - 1)];

        if (edits == 0) {
            return randomValue;
        }

        if (randomValue.length() > prefixLength) {
            randomValue = randomValue.substring(0, prefixLength) + "C" + randomValue.substring(prefixLength);
            edits--;
        }

        if (edits > 0) {
            randomValue = randomValue + "a";
        }
        return randomValue;
    }

    private String getRandomRegexPattern(HashSet<String> values) {
        // Pick one of the indexed document values to focus our queries on.
        String randomValue = values.toArray(new String[0])[randomIntBetween(0, values.size() - 1)];
        return convertToRandomRegex(randomValue);
    }

    // Produces a random regex string guaranteed to match the provided value
    protected String convertToRandomRegex(String randomValue) {
        StringBuilder result = new StringBuilder();
        // Pick a part of the string to change
        int substitutionPoint = randomIntBetween(0, randomValue.length() - 1);
        int substitutionLength = randomIntBetween(1, Math.min(10, randomValue.length() - substitutionPoint));

        // Add any head to the result, unchanged
        if (substitutionPoint > 0) {
            result.append(randomValue.substring(0, substitutionPoint));
        }

        // Modify the middle...
        String replacementPart = randomValue.substring(substitutionPoint, substitutionPoint + substitutionLength);
        int mutation = randomIntBetween(0, 11);
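        // Each case rewrites the chosen substring into a regex fragment that still accepts it,
        // so the generated pattern as a whole must keep matching the original value; the
        // ByteRunAutomaton assertion at the end of this method enforces that invariant.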
        switch (mutation) {
            case 0:
                // OR with random alpha of same length
                result.append("(" + replacementPart + "|c" + randomABString(replacementPart.length()) + ")");
                break;
            case 1:
                // OR with non-existent value
                result.append("(" + replacementPart + "|doesnotexist)");
                break;
            case 2:
                // OR with another randomised regex (used to create nested levels of expression).
                result.append("(" + convertToRandomRegex(replacementPart) + "|doesnotexist)");
                break;
            case 3:
                // Star-replace all ab sequences.
                result.append(replacementPart.replaceAll("ab", ".*"));
                break;
            case 4:
                // .-replace all b chars
                result.append(replacementPart.replaceAll("b", "."));
                break;
            case 5:
                // length-limited stars {1,2}
                result.append(".{1," + replacementPart.length() + "}");
                break;
            case 6:
                // replace all chars with .
                result.append(replacementPart.replaceAll(".", "."));
                break;
            case 7:
                // OR with uppercase chars e.g. [aA] (many of these sorts of expression appear in the wild)
                char[] chars = replacementPart.toCharArray();
                for (char c : chars) {
                    result.append("[" + c + Character.toUpperCase(c) + "]");
                }
                break;
            case 8:
                // NOT a character - replace all b's with "not a"
                result.append(replacementPart.replaceAll("b", "[^a]"));
                break;
            case 9:
                // Make whole part repeatable 1 or more times
                result.append("(" + replacementPart + ")+");
                break;
            case 10:
                // Make whole part optional (matches 0 or 1 times)
                result.append("(" + replacementPart + ")?");
                break;
            case 11:
                // all but ... syntax
                result.append("@&~(doesnotexist.+)");
                break;
            default:
                break;
        }
        // Add any remaining tail, unchanged
        if (substitutionPoint + substitutionLength <= randomValue.length() - 1) {
            result.append(randomValue.substring(substitutionPoint + substitutionLength));
        }

        // Assert our randomly generated regex actually matches the provided raw input.
        RegExp regex = new RegExp(result.toString());
        Automaton automaton = regex.toAutomaton();
        ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automaton);
        BytesRef br = new BytesRef(randomValue);
        assertTrue("[" + result.toString() + "] should match [" + randomValue + "] " + substitutionPoint + "-" + substitutionLength + "/"
            + randomValue.length(), bytesMatcher.run(br.bytes, br.offset, br.length));
        return result.toString();
    }

    protected MappedFieldType provideMappedFieldType(String name) {
        if (name.equals(WILDCARD_FIELD_NAME)) {
@@ -284,7 +682,11 @@ public class WildcardFieldMapperTests extends ESTestCase {
        StringBuilder sb = new StringBuilder();
        while (sb.length() < minLength) {
            if (randomBoolean()) {
                if (randomBoolean()) {
                    sb.append("a");
                } else {
                    sb.append("A");
                }
            } else {
                sb.append("b");
            }