Search - add range query support to wildcard field (#57881) (#57988)

Backport to add range query support to wildcard field

Closes #57816
This commit is contained in:
markharwood 2020-06-12 11:30:54 +01:00 committed by GitHub
parent db03e7c93b
commit 2da8e57f59
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 250 additions and 1 deletion

View File

@ -30,15 +30,19 @@ import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.apache.lucene.search.SortField; import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.RegExp; import org.apache.lucene.util.automaton.RegExp;
import org.apache.lucene.util.automaton.RegExp.Kind; import org.apache.lucene.util.automaton.RegExp.Kind;
import org.elasticsearch.ElasticsearchException; import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.ElasticsearchParseException; import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.common.geo.ShapeRelation;
import org.elasticsearch.common.lucene.BytesRefs; import org.elasticsearch.common.lucene.BytesRefs;
import org.elasticsearch.common.lucene.Lucene; import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.time.DateMathParser;
import org.elasticsearch.common.unit.Fuzziness; import org.elasticsearch.common.unit.Fuzziness;
import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser; import org.elasticsearch.common.xcontent.XContentParser;
@ -70,6 +74,7 @@ import org.elasticsearch.search.aggregations.support.ValuesSourceType;
import java.io.IOException; import java.io.IOException;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.time.ZoneId;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashSet; import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
@ -614,6 +619,12 @@ public class WildcardFieldMapper extends FieldMapper {
return q instanceof MatchAllDocsQuery || q instanceof MatchAllButRequireVerificationQuery; return q instanceof MatchAllDocsQuery || q instanceof MatchAllButRequireVerificationQuery;
} }
// Returns the first ngram token produced for the given fragment; a
// LinkedHashSet preserves the order in which tokens were emitted.
protected String firstNgramToken(String fragment) {
Set<String> orderedTokens = new LinkedHashSet<>();
getNgramTokens(orderedTokens, fragment);
Iterator<String> tokenIterator = orderedTokens.iterator();
return tokenIterator.next();
}
protected void getNgramTokens(Set<String> tokens, String fragment) { protected void getNgramTokens(Set<String> tokens, String fragment) {
if (fragment.equals(TOKEN_START_STRING) || fragment.equals(TOKEN_END_STRING)) { if (fragment.equals(TOKEN_START_STRING) || fragment.equals(TOKEN_END_STRING)) {
// If a regex is a form of match-all e.g. ".*" we only produce the token start/end markers as search // If a regex is a form of match-all e.g. ".*" we only produce the token start/end markers as search
@ -678,6 +689,90 @@ public class WildcardFieldMapper extends FieldMapper {
} }
} }
/**
 * Builds a range query over wildcard-field values. The result combines two MUST clauses:
 * an "acceleration" query over the indexed ngram tokens that cheaply narrows the candidate
 * set, and a slow-but-exact automaton query (run against the binary doc values) that
 * verifies each candidate against the requested bounds.
 * The relation/timeZone/parser parameters are part of the shared rangeQuery signature
 * and are not used by this string-based implementation.
 */
@Override
public Query rangeQuery(
Object lowerTerm,
Object upperTerm,
boolean includeLower,
boolean includeUpper,
ShapeRelation relation,
ZoneId timeZone,
DateMathParser parser,
QueryShardContext context
) {
// Range queries on wildcard fields always require per-doc verification, so they are
// gated behind the allow-expensive-queries setting.
if (context.allowExpensiveQueries() == false) {
throw new ElasticsearchException("[range] queries on [wildcard] fields cannot be executed when '" +
ALLOW_EXPENSIVE_QUERIES.getKey() + "' is set to false.");
}
BytesRef lower = lowerTerm == null ? null : BytesRefs.toBytesRef(lowerTerm);
BytesRef upper = upperTerm == null ? null : BytesRefs.toBytesRef(upperTerm);
Query accelerationQuery = null;
if (lowerTerm != null && upperTerm != null) {
// Long common prefixes e.g. "C:/Program Files/a.txt" to "C:/Program Files/z.txt"
// can be accelerated by searching for all the common leading ngrams e.g. c:/, /pr, rog, gra etc
StringBuilder commonPrefix = new StringBuilder();
String lowerS = addLineEndChars(toLowerCase(lower.utf8ToString()));
String upperS = addLineEndChars(toLowerCase(upper.utf8ToString()));
// Walk both bounds codepoint-by-codepoint and collect the shared leading run.
for (int i = 0; i < Math.min(lowerS.length(), upperS.length());) {
final int cL = lowerS.codePointAt(i);
final int cU = upperS.codePointAt(i);
if (cL == cU) {
commonPrefix.append(Character.toChars(cL));
} else {
break;
}
int length = Character.charCount(cL);
i += length;
}
if (commonPrefix.length() > 0) {
Set<String> tokens = new HashSet<>();
getNgramTokens(tokens, commonPrefix.toString());
BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
for (String token : tokens) {
int tokenSize = token.codePointCount(0, token.length());
// Single-character tokens and the end-of-string marker are too common to
// usefully narrow the candidate set, so skip them.
if (tokenSize < 2 || token.equals(WildcardFieldMapper.TOKEN_END_STRING)) {
continue;
}
if (tokenSize == NGRAM_SIZE) {
// Full-size ngram - must be matched exactly.
TermQuery tq = new TermQuery(new Term(name(), token));
bqBuilder.add(new BooleanClause(tq, Occur.MUST));
} else {
// Trailing fragment shorter than NGRAM_SIZE - match it as a prefix of any indexed ngram.
PrefixQuery wq = new PrefixQuery(new Term(name(), token));
wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE);
bqBuilder.add(new BooleanClause(wq, Occur.MUST));
}
}
BooleanQuery bq = bqBuilder.build();
// Only use the ngram conjunction if at least one clause survived the filtering above.
if (bq.clauses().size() > 0) {
accelerationQuery = bq;
}
}
}
if (accelerationQuery == null) {
// Fallback - if there is no common prefix sequence then we look for the range of ngrams that appear at the start
// of the string e.g. given 100 to 999 we would search for ngrams in the range
// TOKEN_START_OR_END_CHAR + "10" to
// TOKEN_START_OR_END_CHAR + "99"
// Bounds are deliberately kept inclusive here; exact inclusivity is enforced by the
// verification query below.
BytesRef lowerNgram = lower == null ? null : new BytesRef(firstNgramToken(
addLineEndChars(toLowerCase(lower.utf8ToString()))));
BytesRef upperNgram = upper == null ? null : new BytesRef(firstNgramToken(
addLineEndChars(toLowerCase(upper.utf8ToString()))));
accelerationQuery = new TermRangeQuery(name(), lowerNgram, upperNgram, true, true);
}
// Exact check: the range automaton is built lazily and applied to the original values
// held in binary doc values, honouring the caller's inclusivity flags.
Supplier <Automaton> deferredAutomatonSupplier = ()->{
return TermRangeQuery.toAutomaton(lower, upper, includeLower, includeUpper);
};
AutomatonQueryOnBinaryDv slowQuery = new AutomatonQueryOnBinaryDv(name(), lower + "-" + upper, deferredAutomatonSupplier);
// Both clauses are MUST: the acceleration query prunes, the automaton query verifies.
BooleanQuery.Builder qBuilder = new BooleanQuery.Builder();
qBuilder.add(accelerationQuery, Occur.MUST);
qBuilder.add(slowQuery, Occur.MUST);
return qBuilder.build();
}
@Override @Override
public Query fuzzyQuery( public Query fuzzyQuery(
Object value, Object value,

View File

@ -30,6 +30,7 @@ import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort; import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField; import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
@ -214,7 +215,7 @@ public class WildcardFieldMapperTests extends ESTestCase {
Query wildcardFieldQuery = null; Query wildcardFieldQuery = null;
Query keywordFieldQuery = null; Query keywordFieldQuery = null;
String pattern = null; String pattern = null;
switch (randomInt(3)) { switch (randomInt(4)) {
case 0: case 0:
pattern = getRandomWildcardPattern(); pattern = getRandomWildcardPattern();
wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(pattern, null, MOCK_QSC); wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(pattern, null, MOCK_QSC);
@ -259,6 +260,14 @@ public class WildcardFieldMapperTests extends ESTestCase {
keywordFieldQuery = keywordFieldType.fieldType().fuzzyQuery(pattern, fuzziness, prefixLength, 50, keywordFieldQuery = keywordFieldType.fieldType().fuzzyQuery(pattern, fuzziness, prefixLength, 50,
transpositions, MOCK_QSC); transpositions, MOCK_QSC);
break; break;
case 4:
TermRangeQuery trq = getRandomRange(values);
wildcardFieldQuery = wildcardFieldType.fieldType().rangeQuery(trq.getLowerTerm(),trq.getUpperTerm(), trq.includesLower(),
trq.includesUpper(), null, null, null, MOCK_QSC);
keywordFieldQuery = keywordFieldType.fieldType().rangeQuery(trq.getLowerTerm(),trq.getUpperTerm(), trq.includesLower(),
trq.includesUpper(), null, null, null, MOCK_QSC);
break;
} }
TopDocs kwTopDocs = searcher.search(keywordFieldQuery, values.size() + 1, Sort.RELEVANCE); TopDocs kwTopDocs = searcher.search(keywordFieldQuery, values.size() + 1, Sort.RELEVANCE);
TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, values.size() + 1, Sort.RELEVANCE); TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, values.size() + 1, Sort.RELEVANCE);
@ -294,6 +303,76 @@ public class WildcardFieldMapperTests extends ESTestCase {
dir.close(); dir.close();
} }
// Convenience wrapper: builds both the Lucene document and the parse-context
// document for a single value and hands them to the indexing helper.
private void indexDoc(RandomIndexWriter iw, String value) throws IOException {
ParseContext.Document parsedDocument = new ParseContext.Document();
Document luceneDocument = new Document();
addFields(parsedDocument, luceneDocument, value);
indexDoc(parsedDocument, luceneDocument, iw);
}
// Indexes a fixed set of values and checks that wildcard-field range queries return
// exactly the same documents as the equivalent keyword-field range queries, covering
// both acceleration strategies (long common prefix vs no common prefix).
public void testRangeQueryVersusKeywordField() throws IOException {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER);
iwc.setMergePolicy(newTieredMergePolicy(random()));
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
// Values sharing a long common prefix (exercises prefix-ngram acceleration)
indexDoc(iw, "C:\\Program Files\\a.txt");
indexDoc(iw, "C:\\Program Files\\n.txt");
indexDoc(iw, "C:\\Program Files\\z.txt");
// Values with no common prefix (exercises the first-ngram range fallback)
indexDoc(iw, "a.txt");
indexDoc(iw, "n.txt");
indexDoc(iw, "z.txt");
iw.forceMerge(1);
DirectoryReader reader = iw.getReader();
IndexSearcher searcher = newSearcher(reader);
iw.close();
// {lower, upper} bound pairs; null means an open-ended bound.
String[][] boundsTable = {
{"C:\\Program Files\\a", "C:\\Program Files\\z"},
{"C:\\Program Files\\a", "C:\\Program Files\\n"},
{null, "C:\\Program Files\\z"},
{"C:\\Program Files\\a", null},
{"a.txt", "z.txt"},
{"a.txt", "n.txt"},
{null, "z.txt"},
{"a.txt", null}
};
for (String[] bounds : boundsTable) {
BytesRef lowerBound = bounds[0] == null ? null : new BytesRef(bounds[0]);
BytesRef upperBound = bounds[1] == null ? null : new BytesRef(bounds[1]);
TermRangeQuery trq = new TermRangeQuery(WILDCARD_FIELD_NAME, lowerBound, upperBound, randomBoolean(), randomBoolean());
Query wildcardFieldQuery = wildcardFieldType.fieldType().rangeQuery(trq.getLowerTerm(), trq.getUpperTerm(),
trq.includesLower(), trq.includesUpper(), null, null, null, MOCK_QSC);
Query keywordFieldQuery = keywordFieldType.fieldType().rangeQuery(trq.getLowerTerm(), trq.getUpperTerm(),
trq.includesLower(), trq.includesUpper(), null, null, null, MOCK_QSC);
TopDocs kwTopDocs = searcher.search(keywordFieldQuery, 10, Sort.RELEVANCE);
TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, 10, Sort.RELEVANCE);
assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(kwTopDocs.totalHits.value));
// Both queries must select the identical set of documents.
HashSet<Integer> keywordDocIds = new HashSet<>();
for (ScoreDoc kwHit : kwTopDocs.scoreDocs) {
keywordDocIds.add(kwHit.doc);
}
HashSet<Integer> wildcardDocIds = new HashSet<>();
for (ScoreDoc wcHit : wildcardFieldTopDocs.scoreDocs) {
wildcardDocIds.add(wcHit.doc);
}
assertThat(wildcardDocIds, equalTo(keywordDocIds));
}
reader.close();
dir.close();
}
public void testRegexAcceleration() throws IOException, ParseException { public void testRegexAcceleration() throws IOException, ParseException {
// All these expressions should rewrite to a match all with no verification step required at all // All these expressions should rewrite to a match all with no verification step required at all
String superfastRegexes[]= { ".*", "...*..", "(foo|bar|.*)", "@"}; String superfastRegexes[]= { ".*", "...*..", "(foo|bar|.*)", "@"};
@ -485,6 +564,54 @@ public class WildcardFieldMapperTests extends ESTestCase {
} }
} }
// Fixture describing one range-acceleration expectation: the bounds of an
// inclusive range query plus the space-separated ngrams expected in the
// accelerating query ('_' stands in for the token start marker).
static class RangeTest {
String lower;
String upper;
String ngrams;

RangeTest(String lower, String upper, String ngrams) {
this.lower = lower;
this.upper = upper;
this.ngrams = ngrams;
}

// Builds the wildcard-field range query under test (bounds always inclusive).
Query getRangeQuery() {
return wildcardFieldType.fieldType().rangeQuery(lower, upper, true, true, null, null, null, MOCK_QSC);
}

// Expected acceleration clauses: one MUST TermQuery per declared ngram.
Query getExpectedApproxQuery() throws ParseException {
BooleanQuery.Builder expected = new BooleanQuery.Builder();
if (ngrams != null) {
for (String ngram : ngrams.split(" ")) {
String token = ngram.replaceAll("_", WildcardFieldMapper.TOKEN_START_STRING);
expected.add(new TermQuery(new Term(WILDCARD_FIELD_NAME, token)), Occur.MUST);
}
}
return expected.build();
}
}
// Verifies that inclusive range queries with long common prefixes generate the
// expected set of accelerating ngram clauses.
public void testRangeAcceleration() throws IOException, ParseException {
RangeTest[] rangeTests = {
new RangeTest("c:/a.txt", "c:/z.txt", "_c: c:/"),
new RangeTest("C:/ProgramFiles/a.txt", "C:/ProgramFiles/z.txt", "_c: :/p pro ogr ram mfi ile es/"),
};
for (RangeTest rangeTest : rangeTests) {
Query rangeQuery = rangeTest.getRangeQuery();
testExpectedAccelerationQuery(rangeTest.lower + "-" + rangeTest.upper, rangeQuery, rangeTest.getExpectedApproxQuery());
}
}
void testExpectedAccelerationQuery(String regex, Query combinedQuery, String expectedAccelerationQueryString) throws ParseException { void testExpectedAccelerationQuery(String regex, Query combinedQuery, String expectedAccelerationQueryString) throws ParseException {
QueryParser qsp = new QueryParser(WILDCARD_FIELD_NAME, new KeywordAnalyzer()); QueryParser qsp = new QueryParser(WILDCARD_FIELD_NAME, new KeywordAnalyzer());
@ -531,6 +658,33 @@ public class WildcardFieldMapperTests extends ESTestCase {
return randomValue; return randomValue;
} }
// Derives a random range from one of the indexed values: the value itself is the
// lower bound and a copy with a random slice's 'a' characters swapped to 'z'
// characters is the upper bound. Inclusivity of each end is randomized.
private TermRangeQuery getRandomRange(HashSet<String> values) {
// Pick one of the indexed document values to focus our queries on.
String lowerValue = values.toArray(new String[0])[randomIntBetween(0, values.size() - 1)];
// Choose a random slice of the string to mutate.
int start = randomIntBetween(0, lowerValue.length() - 1);
int length = randomIntBetween(1, Math.min(10, lowerValue.length() - start));
// Head is carried over unchanged; the slice is pushed "upwards" (a -> z).
String head = lowerValue.substring(0, start);
String mutated = lowerValue.substring(start, start + length).replaceAll("a", "z");
// Any remaining tail is also carried over unchanged.
String tail = start + length <= lowerValue.length() - 1 ? lowerValue.substring(start + length) : "";
String upperValue = head + mutated + tail;
return new TermRangeQuery(WILDCARD_FIELD_NAME, new BytesRef(lowerValue), new BytesRef(upperValue),
randomBoolean(), randomBoolean());
}
private String getRandomRegexPattern(HashSet<String> values) { private String getRandomRegexPattern(HashSet<String> values) {
// Pick one of the indexed document values to focus our queries on. // Pick one of the indexed document values to focus our queries on.
String randomValue = values.toArray(new String[0])[randomIntBetween(0, values.size()-1)]; String randomValue = values.toArray(new String[0])[randomIntBetween(0, values.size()-1)];