Fix query analyzer logic for mixed conjunctions of terms and ranges (#49803)

When the query analyzer examines a conjunction containing both terms and ranges,
it should only include ranges in the minimum_should_match calculation if there are no
other range queries on that same field within the conjunction. This is because we cannot
build a selection query over disjoint ranges on the same field, and it is not easy to check
if two range queries have an overlap.

The current logic to calculate this just sets minimum_should_match to 1 or 0, dependent
on whether or not the current range is over a field that has already been seen. However, this
can be incorrect in the case that there are terms in the same match group which adjust the
minimum_should_match downwards. Instead, the logic should be changed to match the
terms extraction, whereby we adjust minimum_should_match downwards if we have already
seen a range field.

Fixes #49684
This commit is contained in:
Alan Woodward 2019-12-10 10:44:31 +00:00
parent a16abf921f
commit 3d8c2f9e18
2 changed files with 172 additions and 31 deletions

View File

@ -247,7 +247,7 @@ final class QueryAnalyzer {
List<Result> conjunctions = conjunctionsWithUnknowns.stream().filter(r -> r.isUnknown() == false).collect(Collectors.toList()); List<Result> conjunctions = conjunctionsWithUnknowns.stream().filter(r -> r.isUnknown() == false).collect(Collectors.toList());
if (conjunctions.isEmpty()) { if (conjunctions.isEmpty()) {
if (conjunctionsWithUnknowns.isEmpty()) { if (conjunctionsWithUnknowns.isEmpty()) {
throw new IllegalArgumentException("Must have at least on conjunction sub result"); throw new IllegalArgumentException("Must have at least one conjunction sub result");
} }
return conjunctionsWithUnknowns.get(0); // all conjunctions are unknown, so just return the first one return conjunctionsWithUnknowns.get(0); // all conjunctions are unknown, so just return the first one
} }
@ -260,64 +260,73 @@ final class QueryAnalyzer {
return subResult; return subResult;
} }
} }
int msm = 0; int msm = 0;
boolean verified = conjunctionsWithUnknowns.size() == conjunctions.size(); boolean verified = conjunctionsWithUnknowns.size() == conjunctions.size();
boolean matchAllDocs = true; boolean matchAllDocs = true;
boolean hasDuplicateTerms = false;
Set<QueryExtraction> extractions = new HashSet<>(); Set<QueryExtraction> extractions = new HashSet<>();
Set<String> seenRangeFields = new HashSet<>(); Set<String> seenRangeFields = new HashSet<>();
for (Result result : conjunctions) { for (Result result : conjunctions) {
// In case that there are duplicate query extractions we need to be careful with
// incrementing msm,
// because that could lead to valid matches not becoming candidate matches:
// query: (field:val1 AND field:val2) AND (field:val2 AND field:val3)
// doc: field: val1 val2 val3
// So lets be protective and decrease the msm:
int resultMsm = result.minimumShouldMatch; int resultMsm = result.minimumShouldMatch;
for (QueryExtraction queryExtraction : result.extractions) { for (QueryExtraction queryExtraction : result.extractions) {
if (queryExtraction.range != null) { if (queryExtraction.range != null) {
// In case of range queries each extraction does not simply increment the // In case of range queries each extraction does not simply increment the
// minimum_should_match // minimum_should_match for that percolator query like for a term based extraction,
// for that percolator query like for a term based extraction, so that can lead // so that can lead to more false positives for percolator queries with range queries
// to more false // than term based queries.
// positives for percolator queries with range queries than term based queries. // This is because the way number fields are extracted from the document to be
// The is because the way number fields are extracted from the document to be // percolated. Per field a single range is extracted and if a percolator query has two or
// percolated. // more range queries on the same field, then the minimum should match can be higher than clauses
// Per field a single range is extracted and if a percolator query has two or // in the CoveringQuery. Therefore right now the minimum should match is only incremented once per
// more range queries // number field when processing the percolator query at index time.
// on the same field, then the minimum should match can be higher than clauses // For multiple ranges within a single extraction (ie from an existing conjunction or disjunction)
// in the CoveringQuery. // then this will already have been taken care of, so we only check against fieldnames from
// Therefore right now the minimum should match is incremented once per number // previously processed extractions, and don't add to the seenRangeFields list until all
// field when processing // extractions from this result are processed
// the percolator query at index time. if (seenRangeFields.contains(queryExtraction.range.fieldName)) {
if (seenRangeFields.add(queryExtraction.range.fieldName)) { resultMsm = Math.max(0, resultMsm - 1);
resultMsm = 1;
} else {
resultMsm = 0;
verified = false; verified = false;
break;
}
} }
} else {
// In case that there are duplicate term query extractions we need to be careful with
// incrementing msm, because that could lead to valid matches not becoming candidate matches:
// query: (field:val1 AND field:val2) AND (field:val2 AND field:val3)
// doc: field: val1 val2 val3
// So lets be protective and decrease the msm:
if (extractions.contains(queryExtraction)) { if (extractions.contains(queryExtraction)) {
resultMsm = Math.max(0, resultMsm - 1); resultMsm = Math.max(0, resultMsm - 1);
verified = false; verified = false;
} }
} }
}
msm += resultMsm; msm += resultMsm;
// add range fields from this Result to the seenRangeFields set so that minimumShouldMatch is correctly
// calculated for subsequent Results
result.extractions.stream()
.map(e -> e.range)
.filter(Objects::nonNull)
.map(e -> e.fieldName)
.forEach(seenRangeFields::add);
if (result.verified == false if (result.verified == false
// If some inner extractions are optional, the result can't be verified // If some inner extractions are optional, the result can't be verified
|| result.minimumShouldMatch < result.extractions.size()) { || result.minimumShouldMatch < result.extractions.size()) {
verified = false; verified = false;
} }
matchAllDocs &= result.matchAllDocs; matchAllDocs &= result.matchAllDocs;
extractions.addAll(result.extractions); extractions.addAll(result.extractions);
} }
if (matchAllDocs) { if (matchAllDocs) {
return new Result(matchAllDocs, verified); return new Result(matchAllDocs, verified);
} else { } else {
return new Result(verified, extractions, hasDuplicateTerms ? 1 : msm); return new Result(verified, extractions, msm);
} }
} else { } else {
Result bestClause = null; Result bestClause = null;
for (Result result : conjunctions) { for (Result result : conjunctions) {

View File

@ -82,6 +82,7 @@ import static org.elasticsearch.percolator.QueryAnalyzer.selectBestResult;
import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.is; import static org.hamcrest.Matchers.is;
import static org.hamcrest.Matchers.sameInstance; import static org.hamcrest.Matchers.sameInstance;
import static org.hamcrest.collection.IsCollectionWithSize.hasSize;
public class QueryAnalyzerTests extends ESTestCase { public class QueryAnalyzerTests extends ESTestCase {
@ -1506,4 +1507,135 @@ public class QueryAnalyzerTests extends ESTestCase {
assertTermsEqual(result.extractions, new Term("field", "a")); assertTermsEqual(result.extractions, new Term("field", "a"));
} }
public void testCombinedRangeAndTermWithMinimumShouldMatch() {
Query disj = new BooleanQuery.Builder()
.add(IntPoint.newRangeQuery("i", 0, 10), Occur.SHOULD)
.add(new TermQuery(new Term("f", "v1")), Occur.SHOULD)
.add(new TermQuery(new Term("f", "v1")), Occur.SHOULD)
.setMinimumNumberShouldMatch(2)
.build();
Result r = analyze(disj, Version.CURRENT);
assertThat(r.minimumShouldMatch, equalTo(1));
assertThat(r.extractions, hasSize(2));
assertFalse(r.matchAllDocs);
assertFalse(r.verified);
Query q = new BooleanQuery.Builder()
.add(IntPoint.newRangeQuery("i", 0, 10), Occur.SHOULD)
.add(new TermQuery(new Term("f", "v1")), Occur.SHOULD)
.add(new TermQuery(new Term("f", "v1")), Occur.SHOULD)
.add(new TermQuery(new Term("f", "v1")), Occur.FILTER)
.setMinimumNumberShouldMatch(2)
.build();
Result result = analyze(q, Version.CURRENT);
assertThat(result.minimumShouldMatch, equalTo(1));
assertThat(result.extractions.size(), equalTo(2));
assertFalse(result.verified);
assertFalse(result.matchAllDocs);
q = new BooleanQuery.Builder()
.add(q, Occur.MUST)
.add(q, Occur.MUST)
.build();
result = analyze(q, Version.CURRENT);
assertThat(result.minimumShouldMatch, equalTo(1));
assertThat(result.extractions.size(), equalTo(2));
assertFalse(result.verified);
assertFalse(result.matchAllDocs);
Query q2 = new BooleanQuery.Builder()
.add(new TermQuery(new Term("f", "v1")), Occur.FILTER)
.add(IntPoint.newRangeQuery("i", 15, 20), Occur.SHOULD)
.add(new TermQuery(new Term("f", "v2")), Occur.SHOULD)
.add(new TermQuery(new Term("f", "v2")), Occur.MUST)
.setMinimumNumberShouldMatch(1)
.build();
result = analyze(q2, Version.CURRENT);
assertThat(result.minimumShouldMatch, equalTo(2));
assertThat(result.extractions, hasSize(3));
assertFalse(result.verified);
assertFalse(result.matchAllDocs);
// multiple range queries on different fields
Query q3 = new BooleanQuery.Builder()
.add(IntPoint.newRangeQuery("i", 15, 20), Occur.SHOULD)
.add(IntPoint.newRangeQuery("i2", 15, 20), Occur.SHOULD)
.add(new TermQuery(new Term("f", "v1")), Occur.SHOULD)
.add(new TermQuery(new Term("f", "v2")), Occur.MUST)
.setMinimumNumberShouldMatch(1)
.build();
result = analyze(q3, Version.CURRENT);
assertThat(result.minimumShouldMatch, equalTo(2));
assertThat(result.extractions, hasSize(4));
assertFalse(result.verified);
assertFalse(result.matchAllDocs);
// multiple disjoint range queries on the same field
Query q4 = new BooleanQuery.Builder()
.add(IntPoint.newRangeQuery("i", 15, 20), Occur.SHOULD)
.add(IntPoint.newRangeQuery("i", 25, 30), Occur.SHOULD)
.add(IntPoint.newRangeQuery("i", 35, 40), Occur.SHOULD)
.add(new TermQuery(new Term("f", "v1")), Occur.SHOULD)
.add(new TermQuery(new Term("f", "v2")), Occur.MUST)
.setMinimumNumberShouldMatch(1)
.build();
result = analyze(q4, Version.CURRENT);
assertThat(result.minimumShouldMatch, equalTo(2));
assertThat(result.extractions, hasSize(5));
assertFalse(result.verified);
assertFalse(result.matchAllDocs);
// multiple conjunction range queries on the same field
Query q5 = new BooleanQuery.Builder()
.add(new BooleanQuery.Builder()
.add(IntPoint.newRangeQuery("i", 15, 20), Occur.MUST)
.add(IntPoint.newRangeQuery("i", 25, 30), Occur.MUST)
.build(), Occur.MUST)
.add(IntPoint.newRangeQuery("i", 35, 40), Occur.MUST)
.add(new TermQuery(new Term("f", "v2")), Occur.MUST)
.build();
result = analyze(q5, Version.CURRENT);
assertThat(result.minimumShouldMatch, equalTo(2));
assertThat(result.extractions, hasSize(4));
assertFalse(result.verified);
assertFalse(result.matchAllDocs);
// multiple conjunction range queries on different fields
Query q6 = new BooleanQuery.Builder()
.add(new BooleanQuery.Builder()
.add(IntPoint.newRangeQuery("i", 15, 20), Occur.MUST)
.add(IntPoint.newRangeQuery("i2", 25, 30), Occur.MUST)
.build(), Occur.MUST)
.add(IntPoint.newRangeQuery("i", 35, 40), Occur.MUST)
.add(new TermQuery(new Term("f", "v2")), Occur.MUST)
.build();
result = analyze(q6, Version.CURRENT);
assertThat(result.minimumShouldMatch, equalTo(3));
assertThat(result.extractions, hasSize(4));
assertFalse(result.verified);
assertFalse(result.matchAllDocs);
// mixed term and range conjunctions
Query q7 = new BooleanQuery.Builder()
.add(new BooleanQuery.Builder()
.add(IntPoint.newRangeQuery("i", 1, 2), Occur.MUST)
.add(new TermQuery(new Term("f", "1")), Occur.MUST)
.build(), Occur.MUST)
.add(new BooleanQuery.Builder()
.add(IntPoint.newRangeQuery("i", 1, 2), Occur.MUST)
.add(new TermQuery(new Term("f", "2")), Occur.MUST)
.build(), Occur.MUST)
.build();
result = analyze(q7, Version.CURRENT);
assertThat(result.minimumShouldMatch, equalTo(3));
assertThat(result.extractions, hasSize(3));
assertFalse(result.verified);
assertFalse(result.matchAllDocs);
}
} }