Backport to add range query support to wildcard field

Closes #57816

This commit is contained in:
parent db03e7c93b
commit 2da8e57f59
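Not part of the diff: a minimal sketch of how a client might issue the kind of range query this commit enables, assuming the standard QueryBuilders API; the field name "file_path" is a hypothetical wildcard-mapped field.

    import org.elasticsearch.index.query.QueryBuilders;
    import org.elasticsearch.index.query.RangeQueryBuilder;

    public class WildcardRangeExample {
        public static void main(String[] args) {
            // Builds the same shape of range query that this commit teaches
            // the wildcard field type to execute server-side.
            RangeQueryBuilder query = QueryBuilders.rangeQuery("file_path")
                .gte("C:\\Program Files\\a.txt")
                .lte("C:\\Program Files\\z.txt");
            System.out.println(query); // prints the query as JSON
        }
    }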
@@ -30,15 +30,19 @@ import org.apache.lucene.search.PrefixQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.SortField;
 import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TermRangeQuery;
 import org.apache.lucene.search.WildcardQuery;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.RegExp;
 import org.apache.lucene.util.automaton.RegExp.Kind;
 import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.ElasticsearchParseException;
+import org.elasticsearch.common.geo.ShapeRelation;
 import org.elasticsearch.common.lucene.BytesRefs;
 import org.elasticsearch.common.lucene.Lucene;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.time.DateMathParser;
 import org.elasticsearch.common.unit.Fuzziness;
 import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.common.xcontent.XContentParser;
@@ -70,6 +74,7 @@ import org.elasticsearch.search.aggregations.support.ValuesSourceType;
 
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
+import java.time.ZoneId;
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.Iterator;
@@ -614,6 +619,12 @@ public class WildcardFieldMapper extends FieldMapper {
         return q instanceof MatchAllDocsQuery || q instanceof MatchAllButRequireVerificationQuery;
     }
 
+    protected String firstNgramToken(String fragment) {
+        LinkedHashSet<String> tokens = new LinkedHashSet<>();
+        getNgramTokens(tokens, fragment);
+        return tokens.iterator().next();
+    }
+
     protected void getNgramTokens(Set<String> tokens, String fragment) {
         if (fragment.equals(TOKEN_START_STRING) || fragment.equals(TOKEN_END_STRING)) {
             // If a regex is a form of match-all e.g. ".*" we only produce the token start/end markers as search
@@ -678,6 +689,90 @@ public class WildcardFieldMapper extends FieldMapper {
         }
     }
 
+    @Override
+    public Query rangeQuery(
+        Object lowerTerm,
+        Object upperTerm,
+        boolean includeLower,
+        boolean includeUpper,
+        ShapeRelation relation,
+        ZoneId timeZone,
+        DateMathParser parser,
+        QueryShardContext context
+    ) {
+        if (context.allowExpensiveQueries() == false) {
+            throw new ElasticsearchException("[range] queries on [wildcard] fields cannot be executed when '" +
+                ALLOW_EXPENSIVE_QUERIES.getKey() + "' is set to false.");
+        }
+        BytesRef lower = lowerTerm == null ? null : BytesRefs.toBytesRef(lowerTerm);
+        BytesRef upper = upperTerm == null ? null : BytesRefs.toBytesRef(upperTerm);
+        Query accelerationQuery = null;
+        if (lowerTerm != null && upperTerm != null) {
+            // Long common prefixes e.g. "C:/Program Files/a.txt" to "C:/Program Files/z.txt"
+            // can be accelerated by searching for all the common leading ngrams e.g. c:/, /pr, rog, gra etc
+            StringBuilder commonPrefix = new StringBuilder();
+            String lowerS = addLineEndChars(toLowerCase(lower.utf8ToString()));
+            String upperS = addLineEndChars(toLowerCase(upper.utf8ToString()));
+            for (int i = 0; i < Math.min(lowerS.length(), upperS.length());) {
+                final int cL = lowerS.codePointAt(i);
+                final int cU = upperS.codePointAt(i);
+                if (cL == cU) {
+                    commonPrefix.append(Character.toChars(cL));
+                } else {
+                    break;
+                }
+                int length = Character.charCount(cL);
+                i += length;
+            }
+
+            if (commonPrefix.length() > 0) {
+                Set<String> tokens = new HashSet<>();
+                getNgramTokens(tokens, commonPrefix.toString());
+                BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
+                for (String token : tokens) {
+                    int tokenSize = token.codePointCount(0, token.length());
+                    if (tokenSize < 2 || token.equals(WildcardFieldMapper.TOKEN_END_STRING)) {
+                        continue;
+                    }
+
+                    if (tokenSize == NGRAM_SIZE) {
+                        TermQuery tq = new TermQuery(new Term(name(), token));
+                        bqBuilder.add(new BooleanClause(tq, Occur.MUST));
+                    } else {
+                        PrefixQuery wq = new PrefixQuery(new Term(name(), token));
+                        wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE);
+                        bqBuilder.add(new BooleanClause(wq, Occur.MUST));
+                    }
+                }
+                BooleanQuery bq = bqBuilder.build();
+                if (bq.clauses().size() > 0) {
+                    accelerationQuery = bq;
+                }
+            }
+        }
+        if (accelerationQuery == null) {
+            // Fallback - if there is no common prefix sequence then we look for the range of ngrams that appear at the start
+            // of the string e.g. given 100 to 999 we would search for ngrams in the range
+            //   TOKEN_START_OR_END_CHAR + "10" to
+            //   TOKEN_START_OR_END_CHAR + "99"
+            BytesRef lowerNgram = lower == null ? null : new BytesRef(firstNgramToken(
+                addLineEndChars(toLowerCase(lower.utf8ToString()))));
+            BytesRef upperNgram = upper == null ? null : new BytesRef(firstNgramToken(
+                addLineEndChars(toLowerCase(upper.utf8ToString()))));
+            accelerationQuery = new TermRangeQuery(name(), lowerNgram, upperNgram, true, true);
+        }
+
+        Supplier<Automaton> deferredAutomatonSupplier = () -> {
+            return TermRangeQuery.toAutomaton(lower, upper, includeLower, includeUpper);
+        };
+        AutomatonQueryOnBinaryDv slowQuery = new AutomatonQueryOnBinaryDv(name(), lower + "-" + upper, deferredAutomatonSupplier);
+
+        BooleanQuery.Builder qBuilder = new BooleanQuery.Builder();
+        qBuilder.add(accelerationQuery, Occur.MUST);
+        qBuilder.add(slowQuery, Occur.MUST);
+        return qBuilder.build();
+    }
+
     @Override
     public Query fuzzyQuery(
         Object value,
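Before the test-file hunks, it may help to spell out the acceleration idea the rangeQuery method above relies on. A minimal standalone sketch of its code-point-safe common-prefix walk (not the mapper's code; getNgramTokens and the start/end marker handling are left out):

    // Walk both bounds code point by code point and stop at the first
    // difference. The real mapper then turns this prefix into ngram
    // terms via getNgramTokens(); here we just return the prefix.
    public class CommonPrefixSketch {
        static String commonPrefix(String lowerS, String upperS) {
            StringBuilder commonPrefix = new StringBuilder();
            for (int i = 0; i < Math.min(lowerS.length(), upperS.length());) {
                final int cL = lowerS.codePointAt(i);
                final int cU = upperS.codePointAt(i);
                if (cL != cU) {
                    break;
                }
                commonPrefix.append(Character.toChars(cL));
                i += Character.charCount(cL); // advance by one code point, not one char
            }
            return commonPrefix.toString();
        }

        public static void main(String[] args) {
            // Prints "C:/Program Files/" - every ngram of this prefix must
            // appear in any document that can fall inside the range.
            System.out.println(commonPrefix("C:/Program Files/a.txt", "C:/Program Files/z.txt"));
        }
    }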
@@ -30,6 +30,7 @@ import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.Sort;
 import org.apache.lucene.search.SortField;
 import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TermRangeQuery;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.WildcardQuery;
 import org.apache.lucene.store.Directory;
@@ -214,7 +215,7 @@ public class WildcardFieldMapperTests extends ESTestCase {
            Query wildcardFieldQuery = null;
            Query keywordFieldQuery = null;
            String pattern = null;
-           switch (randomInt(3)) {
+           switch (randomInt(4)) {
            case 0:
                pattern = getRandomWildcardPattern();
                wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(pattern, null, MOCK_QSC);
@@ -259,6 +260,14 @@ public class WildcardFieldMapperTests extends ESTestCase {
                keywordFieldQuery = keywordFieldType.fieldType().fuzzyQuery(pattern, fuzziness, prefixLength, 50,
                    transpositions, MOCK_QSC);
                break;
+           case 4:
+               TermRangeQuery trq = getRandomRange(values);
+               wildcardFieldQuery = wildcardFieldType.fieldType().rangeQuery(trq.getLowerTerm(), trq.getUpperTerm(), trq.includesLower(),
+                   trq.includesUpper(), null, null, null, MOCK_QSC);
+               keywordFieldQuery = keywordFieldType.fieldType().rangeQuery(trq.getLowerTerm(), trq.getUpperTerm(), trq.includesLower(),
+                   trq.includesUpper(), null, null, null, MOCK_QSC);
+               break;
+
            }
            TopDocs kwTopDocs = searcher.search(keywordFieldQuery, values.size() + 1, Sort.RELEVANCE);
            TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, values.size() + 1, Sort.RELEVANCE);
@@ -294,6 +303,76 @@ public class WildcardFieldMapperTests extends ESTestCase {
        dir.close();
    }
 
+   private void indexDoc(RandomIndexWriter iw, String value) throws IOException {
+       Document doc = new Document();
+       ParseContext.Document parseDoc = new ParseContext.Document();
+       addFields(parseDoc, doc, value);
+       indexDoc(parseDoc, doc, iw);
+   }
+
+   public void testRangeQueryVersusKeywordField() throws IOException {
+       Directory dir = newDirectory();
+       IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER);
+       iwc.setMergePolicy(newTieredMergePolicy(random()));
+       RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+       // Tests for acceleration strategy based on long common prefix
+       indexDoc(iw, "C:\\Program Files\\a.txt");
+       indexDoc(iw, "C:\\Program Files\\n.txt");
+       indexDoc(iw, "C:\\Program Files\\z.txt");
+
+       // Tests for acceleration strategy based on no common prefix
+       indexDoc(iw, "a.txt");
+       indexDoc(iw, "n.txt");
+       indexDoc(iw, "z.txt");
+
+       iw.forceMerge(1);
+       DirectoryReader reader = iw.getReader();
+       IndexSearcher searcher = newSearcher(reader);
+       iw.close();
+
+       String[][] rangeTests = {
+           { "C:\\Program Files\\a", "C:\\Program Files\\z" },
+           { "C:\\Program Files\\a", "C:\\Program Files\\n" },
+           { null, "C:\\Program Files\\z" },
+           { "C:\\Program Files\\a", null },
+
+           { "a.txt", "z.txt" },
+           { "a.txt", "n.txt" },
+           { null, "z.txt" },
+           { "a.txt", null }
+       };
+
+       for (String[] bounds : rangeTests) {
+           BytesRef lower = bounds[0] == null ? null : new BytesRef(bounds[0]);
+           BytesRef upper = bounds[1] == null ? null : new BytesRef(bounds[1]);
+           TermRangeQuery trq = new TermRangeQuery(WILDCARD_FIELD_NAME, lower, upper, randomBoolean(), randomBoolean());
+           Query wildcardFieldQuery = wildcardFieldType.fieldType().rangeQuery(trq.getLowerTerm(), trq.getUpperTerm(), trq.includesLower(),
+               trq.includesUpper(), null, null, null, MOCK_QSC);
+           Query keywordFieldQuery = keywordFieldType.fieldType().rangeQuery(trq.getLowerTerm(), trq.getUpperTerm(), trq.includesLower(),
+               trq.includesUpper(), null, null, null, MOCK_QSC);
+
+           TopDocs kwTopDocs = searcher.search(keywordFieldQuery, 10, Sort.RELEVANCE);
+           TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, 10, Sort.RELEVANCE);
+           assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(kwTopDocs.totalHits.value));
+
+           HashSet<Integer> expectedDocs = new HashSet<>();
+           for (ScoreDoc topDoc : kwTopDocs.scoreDocs) {
+               expectedDocs.add(topDoc.doc);
+           }
+           for (ScoreDoc wcTopDoc : wildcardFieldTopDocs.scoreDocs) {
+               assertTrue(expectedDocs.remove(wcTopDoc.doc));
+           }
+           assertThat(expectedDocs.size(), equalTo(0));
+       }
+       reader.close();
+       dir.close();
+   }
+
    public void testRegexAcceleration() throws IOException, ParseException {
        // All these expressions should rewrite to a match all with no verification step required at all
        String superfastRegexes[] = { ".*", "...*..", "(foo|bar|.*)", "@" };
@@ -485,6 +564,54 @@ public class WildcardFieldMapperTests extends ESTestCase {
        }
    }
 
+   static class RangeTest {
+       String lower;
+       String upper;
+       String ngrams;
+
+       RangeTest(String lower, String upper, String ngrams) {
+           super();
+           this.lower = lower;
+           this.upper = upper;
+           this.ngrams = ngrams;
+       }
+
+       Query getRangeQuery() {
+           return wildcardFieldType.fieldType().rangeQuery(lower, upper, true, true, null, null, null, MOCK_QSC);
+       }
+
+       Query getExpectedApproxQuery() throws ParseException {
+           BooleanQuery.Builder bq = new BooleanQuery.Builder();
+           if (ngrams != null) {
+               String[] tokens = ngrams.split(" ");
+               for (String token : tokens) {
+                   Query ngramQuery = new TermQuery(
+                       new Term(WILDCARD_FIELD_NAME, token.replaceAll("_", WildcardFieldMapper.TOKEN_START_STRING))
+                   );
+                   bq.add(ngramQuery, Occur.MUST);
+               }
+           }
+           return bq.build();
+       }
+   }
+
+   public void testRangeAcceleration() throws IOException, ParseException {
+       RangeTest[] tests = {
+           new RangeTest("c:/a.txt", "c:/z.txt", "_c: c:/"),
+           new RangeTest("C:/ProgramFiles/a.txt", "C:/ProgramFiles/z.txt", "_c: :/p pro ogr ram mfi ile es/"),
+       };
+       for (RangeTest test : tests) {
+           Query wildcardFieldQuery = test.getRangeQuery();
+           testExpectedAccelerationQuery(test.lower + "-" + test.upper, wildcardFieldQuery, test.getExpectedApproxQuery());
+       }
+   }
+
    void testExpectedAccelerationQuery(String regex, Query combinedQuery, String expectedAccelerationQueryString) throws ParseException {
 
        QueryParser qsp = new QueryParser(WILDCARD_FIELD_NAME, new KeywordAnalyzer());
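A note on the expected-ngram strings in testRangeAcceleration above: per getExpectedApproxQuery(), tokens are space-separated and "_" stands in for WildcardFieldMapper.TOKEN_START_STRING, so "_c: c:/" denotes the two ngram terms <start-marker>c: and c:/. The Query-typed third argument in the call also implies an overload of testExpectedAccelerationQuery alongside the String version shown in the context lines, presumably elsewhere in the file.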
@@ -531,6 +658,33 @@ public class WildcardFieldMapperTests extends ESTestCase {
        return randomValue;
    }
 
+   private TermRangeQuery getRandomRange(HashSet<String> values) {
+       // Pick one of the indexed document values to focus our queries on.
+       String randomValue = values.toArray(new String[0])[randomIntBetween(0, values.size() - 1)];
+       StringBuilder upper = new StringBuilder();
+       // Pick a part of the string to change
+       int substitutionPoint = randomIntBetween(0, randomValue.length() - 1);
+       int substitutionLength = randomIntBetween(1, Math.min(10, randomValue.length() - substitutionPoint));
+
+       // Add any head to the result, unchanged
+       if (substitutionPoint > 0) {
+           upper.append(randomValue.substring(0, substitutionPoint));
+       }
+
+       // Modify the middle - replace all 'a' chars with 'z'
+       String replacementPart = randomValue.substring(substitutionPoint, substitutionPoint + substitutionLength);
+       upper.append(replacementPart.replaceAll("a", "z"));
+
+       // Add any remaining tail, unchanged
+       if (substitutionPoint + substitutionLength <= randomValue.length() - 1) {
+           upper.append(randomValue.substring(substitutionPoint + substitutionLength));
+       }
+       return new TermRangeQuery(WILDCARD_FIELD_NAME, new BytesRef(randomValue), new BytesRef(upper.toString()),
+           randomBoolean(), randomBoolean());
+   }
+
    private String getRandomRegexPattern(HashSet<String> values) {
        // Pick one of the indexed document values to focus our queries on.
        String randomValue = values.toArray(new String[0])[randomIntBetween(0, values.size() - 1)];
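A side note on getRandomRange above: because the helper only ever substitutes 'a' characters with 'z' (and leaves everything else unchanged), the derived string can never sort below the original, so the pair always forms a valid lower/upper bound. A tiny illustrative check (hypothetical, not part of the tests):

    public class RangeBoundSketch {
        public static void main(String[] args) {
            String lower = "C:\\Program Files\\a.txt";
            // Whole-string substitution, the degenerate case of the helper's
            // random-substring substitution.
            String upper = lower.replaceAll("a", "z");
            System.out.println(lower.compareTo(upper) <= 0); // prints true
        }
    }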