Backport to add range query support to wildcard field Closes #57816
This commit is contained in:
parent
db03e7c93b
commit
2da8e57f59
|
@ -30,15 +30,19 @@ import org.apache.lucene.search.PrefixQuery;
|
|||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.SortField;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.TermRangeQuery;
|
||||
import org.apache.lucene.search.WildcardQuery;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.RegExp;
|
||||
import org.apache.lucene.util.automaton.RegExp.Kind;
|
||||
import org.elasticsearch.ElasticsearchException;
|
||||
import org.elasticsearch.ElasticsearchParseException;
|
||||
import org.elasticsearch.common.geo.ShapeRelation;
|
||||
import org.elasticsearch.common.lucene.BytesRefs;
|
||||
import org.elasticsearch.common.lucene.Lucene;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.common.time.DateMathParser;
|
||||
import org.elasticsearch.common.unit.Fuzziness;
|
||||
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||
import org.elasticsearch.common.xcontent.XContentParser;
|
||||
|
@ -70,6 +74,7 @@ import org.elasticsearch.search.aggregations.support.ValuesSourceType;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.time.ZoneId;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
|
@ -614,6 +619,12 @@ public class WildcardFieldMapper extends FieldMapper {
|
|||
return q instanceof MatchAllDocsQuery || q instanceof MatchAllButRequireVerificationQuery;
|
||||
}
|
||||
|
||||
protected String firstNgramToken(String fragment) {
|
||||
LinkedHashSet<String> tokens = new LinkedHashSet<>();
|
||||
getNgramTokens(tokens, fragment);
|
||||
return tokens.iterator().next();
|
||||
}
|
||||
|
||||
protected void getNgramTokens(Set<String> tokens, String fragment) {
|
||||
if (fragment.equals(TOKEN_START_STRING) || fragment.equals(TOKEN_END_STRING)) {
|
||||
// If a regex is a form of match-all e.g. ".*" we only produce the token start/end markers as search
|
||||
|
@ -678,6 +689,90 @@ public class WildcardFieldMapper extends FieldMapper {
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Query rangeQuery(
|
||||
Object lowerTerm,
|
||||
Object upperTerm,
|
||||
boolean includeLower,
|
||||
boolean includeUpper,
|
||||
ShapeRelation relation,
|
||||
ZoneId timeZone,
|
||||
DateMathParser parser,
|
||||
QueryShardContext context
|
||||
) {
|
||||
if (context.allowExpensiveQueries() == false) {
|
||||
throw new ElasticsearchException("[range] queries on [wildcard] fields cannot be executed when '" +
|
||||
ALLOW_EXPENSIVE_QUERIES.getKey() + "' is set to false.");
|
||||
}
|
||||
BytesRef lower = lowerTerm == null ? null : BytesRefs.toBytesRef(lowerTerm);
|
||||
BytesRef upper = upperTerm == null ? null : BytesRefs.toBytesRef(upperTerm);
|
||||
Query accelerationQuery = null;
|
||||
if (lowerTerm != null && upperTerm != null) {
|
||||
// Long common prefixes e.g. "C:/Program Files/a,txt" to "C:/Program Files/z,txt"
|
||||
// can be accelerated by searching for all the common leading ngrams e.g. c:/, /pr, rog, gra etc
|
||||
StringBuilder commonPrefix = new StringBuilder();
|
||||
String lowerS = addLineEndChars(toLowerCase(lower.utf8ToString()));
|
||||
String upperS = addLineEndChars(toLowerCase(upper.utf8ToString()));
|
||||
for (int i = 0; i < Math.min(lowerS.length(), upperS.length());) {
|
||||
final int cL = lowerS.codePointAt(i);
|
||||
final int cU = upperS.codePointAt(i);
|
||||
if (cL == cU) {
|
||||
commonPrefix.append(Character.toChars(cL));
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
int length = Character.charCount(cL);
|
||||
i += length;
|
||||
}
|
||||
|
||||
if (commonPrefix.length() > 0) {
|
||||
Set<String> tokens = new HashSet<>();
|
||||
getNgramTokens(tokens, commonPrefix.toString());
|
||||
BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
|
||||
for (String token : tokens) {
|
||||
int tokenSize = token.codePointCount(0, token.length());
|
||||
if (tokenSize < 2 || token.equals(WildcardFieldMapper.TOKEN_END_STRING)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (tokenSize == NGRAM_SIZE) {
|
||||
TermQuery tq = new TermQuery(new Term(name(), token));
|
||||
bqBuilder.add(new BooleanClause(tq, Occur.MUST));
|
||||
} else {
|
||||
PrefixQuery wq = new PrefixQuery(new Term(name(), token));
|
||||
wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE);
|
||||
bqBuilder.add(new BooleanClause(wq, Occur.MUST));
|
||||
}
|
||||
}
|
||||
BooleanQuery bq = bqBuilder.build();
|
||||
if (bq.clauses().size() > 0) {
|
||||
accelerationQuery = bq;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (accelerationQuery == null) {
|
||||
// Fallback - if there is no common prefix sequence then we look for the range of ngrams that appear at the start
|
||||
// of the string e.g. given 100 to 999 we would search for ngrams in the range
|
||||
// TOKEN_START_OR_END_CHAR + "10" to
|
||||
// TOKEN_START_OR_END_CHAR + "99"
|
||||
BytesRef lowerNgram = lower == null ? null : new BytesRef(firstNgramToken(
|
||||
addLineEndChars(toLowerCase(lower.utf8ToString()))));
|
||||
BytesRef upperNgram = upper == null ? null : new BytesRef(firstNgramToken(
|
||||
addLineEndChars(toLowerCase(upper.utf8ToString()))));
|
||||
accelerationQuery = new TermRangeQuery(name(), lowerNgram, upperNgram, true, true);
|
||||
}
|
||||
|
||||
Supplier <Automaton> deferredAutomatonSupplier = ()->{
|
||||
return TermRangeQuery.toAutomaton(lower, upper, includeLower, includeUpper);
|
||||
};
|
||||
AutomatonQueryOnBinaryDv slowQuery = new AutomatonQueryOnBinaryDv(name(), lower + "-" + upper, deferredAutomatonSupplier);
|
||||
|
||||
BooleanQuery.Builder qBuilder = new BooleanQuery.Builder();
|
||||
qBuilder.add(accelerationQuery, Occur.MUST);
|
||||
qBuilder.add(slowQuery, Occur.MUST);
|
||||
return qBuilder.build();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Query fuzzyQuery(
|
||||
Object value,
|
||||
|
|
|
@ -30,6 +30,7 @@ import org.apache.lucene.search.ScoreDoc;
|
|||
import org.apache.lucene.search.Sort;
|
||||
import org.apache.lucene.search.SortField;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.TermRangeQuery;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.search.WildcardQuery;
|
||||
import org.apache.lucene.store.Directory;
|
||||
|
@ -214,7 +215,7 @@ public class WildcardFieldMapperTests extends ESTestCase {
|
|||
Query wildcardFieldQuery = null;
|
||||
Query keywordFieldQuery = null;
|
||||
String pattern = null;
|
||||
switch (randomInt(3)) {
|
||||
switch (randomInt(4)) {
|
||||
case 0:
|
||||
pattern = getRandomWildcardPattern();
|
||||
wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(pattern, null, MOCK_QSC);
|
||||
|
@ -259,6 +260,14 @@ public class WildcardFieldMapperTests extends ESTestCase {
|
|||
keywordFieldQuery = keywordFieldType.fieldType().fuzzyQuery(pattern, fuzziness, prefixLength, 50,
|
||||
transpositions, MOCK_QSC);
|
||||
break;
|
||||
case 4:
|
||||
TermRangeQuery trq = getRandomRange(values);
|
||||
wildcardFieldQuery = wildcardFieldType.fieldType().rangeQuery(trq.getLowerTerm(),trq.getUpperTerm(), trq.includesLower(),
|
||||
trq.includesUpper(), null, null, null, MOCK_QSC);
|
||||
keywordFieldQuery = keywordFieldType.fieldType().rangeQuery(trq.getLowerTerm(),trq.getUpperTerm(), trq.includesLower(),
|
||||
trq.includesUpper(), null, null, null, MOCK_QSC);
|
||||
break;
|
||||
|
||||
}
|
||||
TopDocs kwTopDocs = searcher.search(keywordFieldQuery, values.size() + 1, Sort.RELEVANCE);
|
||||
TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, values.size() + 1, Sort.RELEVANCE);
|
||||
|
@ -294,6 +303,76 @@ public class WildcardFieldMapperTests extends ESTestCase {
|
|||
dir.close();
|
||||
}
|
||||
|
||||
private void indexDoc(RandomIndexWriter iw, String value) throws IOException {
|
||||
Document doc = new Document();
|
||||
ParseContext.Document parseDoc = new ParseContext.Document();
|
||||
addFields(parseDoc, doc, value);
|
||||
indexDoc(parseDoc, doc, iw);
|
||||
}
|
||||
|
||||
public void testRangeQueryVersusKeywordField() throws IOException {
|
||||
Directory dir = newDirectory();
|
||||
IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER);
|
||||
iwc.setMergePolicy(newTieredMergePolicy(random()));
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
|
||||
|
||||
// Tests for acceleration strategy based on long common prefix
|
||||
indexDoc(iw, "C:\\Program Files\\a.txt");
|
||||
indexDoc(iw, "C:\\Program Files\\n.txt");
|
||||
indexDoc(iw, "C:\\Program Files\\z.txt");
|
||||
|
||||
// Tests for acceleration strategy based on no common prefix
|
||||
indexDoc(iw, "a.txt");
|
||||
indexDoc(iw, "n.txt");
|
||||
indexDoc(iw, "z.txt");
|
||||
|
||||
iw.forceMerge(1);
|
||||
DirectoryReader reader = iw.getReader();
|
||||
IndexSearcher searcher = newSearcher(reader);
|
||||
iw.close();
|
||||
|
||||
|
||||
String [][] rangeTests = {
|
||||
{"C:\\Program Files\\a", "C:\\Program Files\\z"},
|
||||
{"C:\\Program Files\\a", "C:\\Program Files\\n"},
|
||||
{null, "C:\\Program Files\\z"},
|
||||
{"C:\\Program Files\\a", null},
|
||||
|
||||
{"a.txt", "z.txt"},
|
||||
{"a.txt", "n.txt"},
|
||||
{null, "z.txt"},
|
||||
{"a.txt", null}
|
||||
};
|
||||
|
||||
for (String[] bounds : rangeTests) {
|
||||
BytesRef lower = bounds[0] == null ? null :new BytesRef(bounds[0]);
|
||||
BytesRef upper = bounds[1] == null ? null :new BytesRef(bounds[1]);
|
||||
TermRangeQuery trq = new TermRangeQuery(WILDCARD_FIELD_NAME, lower, upper, randomBoolean(), randomBoolean());
|
||||
Query wildcardFieldQuery = wildcardFieldType.fieldType().rangeQuery(trq.getLowerTerm(),trq.getUpperTerm(), trq.includesLower(),
|
||||
trq.includesUpper(), null, null, null, MOCK_QSC);
|
||||
Query keywordFieldQuery = keywordFieldType.fieldType().rangeQuery(trq.getLowerTerm(),trq.getUpperTerm(), trq.includesLower(),
|
||||
trq.includesUpper(), null, null, null, MOCK_QSC);
|
||||
|
||||
|
||||
TopDocs kwTopDocs = searcher.search(keywordFieldQuery, 10, Sort.RELEVANCE);
|
||||
TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, 10, Sort.RELEVANCE);
|
||||
assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(kwTopDocs.totalHits.value));
|
||||
|
||||
HashSet<Integer> expectedDocs = new HashSet<>();
|
||||
for (ScoreDoc topDoc : kwTopDocs.scoreDocs) {
|
||||
expectedDocs.add(topDoc.doc);
|
||||
}
|
||||
for (ScoreDoc wcTopDoc : wildcardFieldTopDocs.scoreDocs) {
|
||||
assertTrue(expectedDocs.remove(wcTopDoc.doc));
|
||||
}
|
||||
assertThat(expectedDocs.size(), equalTo(0));
|
||||
|
||||
}
|
||||
reader.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
|
||||
public void testRegexAcceleration() throws IOException, ParseException {
|
||||
// All these expressions should rewrite to a match all with no verification step required at all
|
||||
String superfastRegexes[]= { ".*", "...*..", "(foo|bar|.*)", "@"};
|
||||
|
@ -485,6 +564,54 @@ public class WildcardFieldMapperTests extends ESTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
static class RangeTest {
|
||||
String lower;
|
||||
String upper;
|
||||
String ngrams;
|
||||
|
||||
RangeTest(
|
||||
String lower,
|
||||
String upper,
|
||||
String ngrams
|
||||
) {
|
||||
super();
|
||||
this.lower = lower;
|
||||
this.upper = upper;
|
||||
this.ngrams = ngrams;
|
||||
}
|
||||
|
||||
Query getRangeQuery() {
|
||||
return wildcardFieldType.fieldType().rangeQuery(lower, upper, true, true, null, null, null, MOCK_QSC);
|
||||
}
|
||||
|
||||
Query getExpectedApproxQuery() throws ParseException {
|
||||
BooleanQuery.Builder bq = new BooleanQuery.Builder();
|
||||
if (ngrams != null) {
|
||||
String[] tokens = ngrams.split(" ");
|
||||
for (String token : tokens) {
|
||||
Query ngramQuery = new TermQuery(
|
||||
new Term(WILDCARD_FIELD_NAME, token.replaceAll("_", WildcardFieldMapper.TOKEN_START_STRING))
|
||||
);
|
||||
bq.add(ngramQuery, Occur.MUST);
|
||||
}
|
||||
}
|
||||
return bq.build();
|
||||
}
|
||||
}
|
||||
|
||||
public void testRangeAcceleration() throws IOException, ParseException {
|
||||
|
||||
RangeTest[] tests = {
|
||||
new RangeTest("c:/a.txt", "c:/z.txt", "_c: c:/"),
|
||||
new RangeTest("C:/ProgramFiles/a.txt", "C:/ProgramFiles/z.txt", "_c: :/p pro ogr ram mfi ile es/"),
|
||||
};
|
||||
for (RangeTest test : tests) {
|
||||
Query wildcardFieldQuery = test.getRangeQuery();
|
||||
testExpectedAccelerationQuery(test.lower + "-" + test.upper, wildcardFieldQuery, test.getExpectedApproxQuery());
|
||||
}
|
||||
}
|
||||
|
||||
void testExpectedAccelerationQuery(String regex, Query combinedQuery, String expectedAccelerationQueryString) throws ParseException {
|
||||
|
||||
QueryParser qsp = new QueryParser(WILDCARD_FIELD_NAME, new KeywordAnalyzer());
|
||||
|
@ -531,6 +658,33 @@ public class WildcardFieldMapperTests extends ESTestCase {
|
|||
return randomValue;
|
||||
}
|
||||
|
||||
private TermRangeQuery getRandomRange(HashSet<String> values) {
|
||||
// Pick one of the indexed document values to focus our queries on.
|
||||
String randomValue = values.toArray(new String[0])[randomIntBetween(0, values.size()-1)];
|
||||
StringBuilder upper = new StringBuilder();
|
||||
//Pick a part of the string to change
|
||||
int substitutionPoint = randomIntBetween(0, randomValue.length()-1);
|
||||
int substitutionLength = randomIntBetween(1, Math.min(10, randomValue.length() - substitutionPoint));
|
||||
|
||||
//Add any head to the result, unchanged
|
||||
if(substitutionPoint >0) {
|
||||
upper.append(randomValue.substring(0,substitutionPoint));
|
||||
}
|
||||
|
||||
// Modify the middle...
|
||||
String replacementPart = randomValue.substring(substitutionPoint, substitutionPoint+substitutionLength);
|
||||
// .-replace all a chars with z
|
||||
upper.append(replacementPart.replaceAll("a", "z"));
|
||||
|
||||
//add any remaining tail, unchanged
|
||||
if(substitutionPoint + substitutionLength <= randomValue.length()-1) {
|
||||
upper.append(randomValue.substring(substitutionPoint + substitutionLength));
|
||||
}
|
||||
return new TermRangeQuery(WILDCARD_FIELD_NAME, new BytesRef(randomValue), new BytesRef(upper.toString()),
|
||||
randomBoolean(), randomBoolean());
|
||||
}
|
||||
|
||||
|
||||
private String getRandomRegexPattern(HashSet<String> values) {
|
||||
// Pick one of the indexed document values to focus our queries on.
|
||||
String randomValue = values.toArray(new String[0])[randomIntBetween(0, values.size()-1)];
|
||||
|
|
Loading…
Reference in New Issue