Fix more query extraction bugs. (#29388)
I found the following bugs:

- The 6.0 logic for conjunctions didn't work when the MUST/FILTER clauses contained only `match_all` queries, because those clauses didn't propagate the `matchAllDocs` flag.
- Some queries still had the same duplicate-term issue that `BooleanQuery` used to have (see #28353), e.g. `MultiPhraseQuery`.

Closes #29376
commit 85f5382a3c
parent ae2a9f7108
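To make the two failure modes concrete, here is a minimal sketch that mirrors the unit tests added in this commit (QueryAnalyzerTests, further down). The class name and the main() wrapper are illustrative only, not part of the commit, and the sketch assumes it is compiled into the org.elasticsearch.percolator package so that the package-private QueryAnalyzer.analyze method and Result fields are visible:

    package org.elasticsearch.percolator;

    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.BooleanClause.Occur;
    import org.apache.lucene.search.BooleanQuery;
    import org.apache.lucene.search.MatchAllDocsQuery;
    import org.apache.lucene.search.MultiPhraseQuery;
    import org.apache.lucene.search.TermQuery;
    import org.elasticsearch.Version;

    // Illustrative only: not part of the commit.
    public class QueryExtractionBugSketch {
        public static void main(String[] args) {
            // Bug 1: a conjunction whose positive clauses are all match_all must still
            // report matchAllDocs; the 6.0 code path used to lose that flag.
            BooleanQuery.Builder bq = new BooleanQuery.Builder();
            bq.add(new TermQuery(new Term("field", "value")), Occur.MUST_NOT);
            bq.add(new MatchAllDocsQuery(), Occur.MUST);
            QueryAnalyzer.Result bqResult = QueryAnalyzer.analyze(bq.build(), Version.V_6_0_0);
            // Expected after the fix (per the test below): matchAllDocs == true,
            // verified == false, minimumShouldMatch == 0.
            System.out.println(bqResult.matchAllDocs + " " + bqResult.verified + " " + bqResult.minimumShouldMatch);

            // Bug 2: a term duplicated across positions of a MultiPhraseQuery must not
            // inflate minimumShouldMatch, the same problem BooleanQuery had in #28353.
            MultiPhraseQuery mpq = new MultiPhraseQuery.Builder()
                .add(new Term("field", "term1"))
                .add(new Term[] {new Term("field", "term1"), new Term("field", "term2")})
                .build();
            QueryAnalyzer.Result mpqResult = QueryAnalyzer.analyze(mpq, Version.CURRENT);
            // Expected after the fix: minimumShouldMatch == 1, because "term1" only counts once.
            System.out.println(mpqResult.minimumShouldMatch);
        }
    }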
QueryAnalyzer.java

@@ -38,6 +38,7 @@ import org.apache.lucene.search.Query;
 import org.apache.lucene.search.SynonymQuery;
 import org.apache.lucene.search.TermInSetQuery;
 import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.spans.SpanFirstQuery;
 import org.apache.lucene.search.spans.SpanNearQuery;
 import org.apache.lucene.search.spans.SpanNotQuery;
@@ -235,20 +236,18 @@ final class QueryAnalyzer {
                 return new Result(true, Collections.emptySet(), 0);
             }
 
-            if (version.onOrAfter(Version.V_6_1_0)) {
-                Set<QueryExtraction> extractions = new HashSet<>();
+            // This query has the same problem as boolean queries when it comes to duplicated terms
+            // So to keep things simple, we just rewrite to a boolean query
+            BooleanQuery.Builder builder = new BooleanQuery.Builder();
             for (Term[] termArr : terms) {
-                extractions.addAll(Arrays.stream(termArr).map(QueryExtraction::new).collect(toSet()));
+                BooleanQuery.Builder subBuilder = new BooleanQuery.Builder();
+                for (Term term : termArr) {
+                    subBuilder.add(new TermQuery(term), Occur.SHOULD);
                 }
-                return new Result(false, extractions, terms.length);
-            } else {
-                Set<QueryExtraction> bestTermArr = null;
-                for (Term[] termArr : terms) {
-                    Set<QueryExtraction> queryExtractions = Arrays.stream(termArr).map(QueryExtraction::new).collect(toSet());
-                    bestTermArr = selectBestExtraction(bestTermArr, queryExtractions);
-                }
-                return new Result(false, bestTermArr, 1);
+                builder.add(subBuilder.build(), Occur.FILTER);
             }
+            // Make sure to unverify the result
+            return booleanQuery().apply(builder.build(), version).unverify();
         };
     }
 
@@ -263,41 +262,35 @@ final class QueryAnalyzer {
         return (query, version) -> {
             SpanNearQuery spanNearQuery = (SpanNearQuery) query;
             if (version.onOrAfter(Version.V_6_1_0)) {
-                Set<Result> results = Arrays.stream(spanNearQuery.getClauses()).map(clause -> analyze(clause, version)).collect(toSet());
-                int msm = 0;
-                Set<QueryExtraction> extractions = new HashSet<>();
-                Set<String> seenRangeFields = new HashSet<>();
-                for (Result result : results) {
-                    QueryExtraction[] t = result.extractions.toArray(new QueryExtraction[1]);
-                    if (result.extractions.size() == 1 && t[0].range != null) {
-                        if (seenRangeFields.add(t[0].range.fieldName)) {
-                            msm += 1;
+                // This has the same problem as boolean queries when it comes to duplicated clauses
+                // so we rewrite to a boolean query to keep things simple.
+                BooleanQuery.Builder builder = new BooleanQuery.Builder();
+                for (SpanQuery clause : spanNearQuery.getClauses()) {
+                    builder.add(clause, Occur.FILTER);
                 }
+                // make sure to unverify the result
+                return booleanQuery().apply(builder.build(), version).unverify();
             } else {
-                        msm += result.minimumShouldMatch;
-                    }
-                    extractions.addAll(result.extractions);
-                }
-                return new Result(false, extractions, msm);
-            } else {
-                Set<QueryExtraction> bestClauses = null;
+                Result bestClause = null;
                 for (SpanQuery clause : spanNearQuery.getClauses()) {
                     Result temp = analyze(clause, version);
-                    bestClauses = selectBestExtraction(temp.extractions, bestClauses);
+                    bestClause = selectBestResult(temp, bestClause);
                 }
-                return new Result(false, bestClauses, 1);
+                return bestClause;
             }
         };
     }
 
     private static BiFunction<Query, Version, Result> spanOrQuery() {
         return (query, version) -> {
-            Set<QueryExtraction> terms = new HashSet<>();
             SpanOrQuery spanOrQuery = (SpanOrQuery) query;
+            // handle it like a boolean query to not dulplicate eg. logic
+            // about duplicated terms
+            BooleanQuery.Builder builder = new BooleanQuery.Builder();
             for (SpanQuery clause : spanOrQuery.getClauses()) {
-                terms.addAll(analyze(clause, version).extractions);
+                builder.add(clause, Occur.SHOULD);
             }
-            return new Result(false, terms, Math.min(1, terms.size()));
+            return booleanQuery().apply(builder.build(), version);
         };
     }
 
@@ -423,9 +416,13 @@ final class QueryAnalyzer {
                     }
                 }
             } else {
-                Set<QueryExtraction> bestClause = null;
+                Result bestClause = null;
                 UnsupportedQueryException uqe = null;
+                boolean hasProhibitedClauses = false;
                 for (BooleanClause clause : clauses) {
+                    if (clause.isProhibited()) {
+                        hasProhibitedClauses = true;
+                    }
                     if (clause.isRequired() == false) {
                         // skip must_not clauses, we don't need to remember the things that do *not* match...
                         // skip should clauses, this bq has must clauses, so we don't need to remember should clauses,
@@ -440,17 +437,20 @@ final class QueryAnalyzer {
                         uqe = e;
                         continue;
                     }
-                    bestClause = selectBestExtraction(temp.extractions, bestClause);
+                    bestClause = selectBestResult(temp, bestClause);
                 }
                 if (bestClause != null) {
-                    return new Result(false, bestClause, 1);
+                    if (hasProhibitedClauses || minimumShouldMatch > 0) {
+                        bestClause = bestClause.unverify();
+                    }
+                    return bestClause;
                 } else {
                     if (uqe != null) {
                         // we're unable to select the best clause and an exception occurred, so we bail
                         throw uqe;
                     } else {
                         // We didn't find a clause and no exception occurred, so this bq only contained MatchNoDocsQueries,
-                        return new Result(true, Collections.emptySet(), 1);
+                        return new Result(true, Collections.emptySet(), 0);
                     }
                 }
             }
@@ -616,22 +616,40 @@ final class QueryAnalyzer {
         }
     }
 
-    static Set<QueryExtraction> selectBestExtraction(Set<QueryExtraction> extractions1, Set<QueryExtraction> extractions2) {
-        assert extractions1 != null || extractions2 != null;
-        if (extractions1 == null) {
-            return extractions2;
-        } else if (extractions2 == null) {
-            return extractions1;
+    /**
+     * Return an extraction for the conjunction of {@code result1} and {@code result2}
+     * by picking up clauses that look most restrictive and making it unverified if
+     * the other clause is not null and doesn't match all documents. This is used by
+     * 6.0.0 indices which didn't use the terms_set query.
+     */
+    static Result selectBestResult(Result result1, Result result2) {
+        assert result1 != null || result2 != null;
+        if (result1 == null) {
+            return result2;
+        } else if (result2 == null) {
+            return result1;
+        } else if (result1.matchAllDocs) { // conjunction with match_all
+            Result result = result2;
+            if (result1.verified == false) {
+                result = result.unverify();
+            }
+            return result;
+        } else if (result2.matchAllDocs) { // conjunction with match_all
+            Result result = result1;
+            if (result2.verified == false) {
+                result = result.unverify();
+            }
+            return result;
         } else {
             // Prefer term based extractions over range based extractions:
             boolean onlyRangeBasedExtractions = true;
-            for (QueryExtraction clause : extractions1) {
+            for (QueryExtraction clause : result1.extractions) {
                 if (clause.term != null) {
                     onlyRangeBasedExtractions = false;
                     break;
                 }
             }
-            for (QueryExtraction clause : extractions2) {
+            for (QueryExtraction clause : result2.extractions) {
                 if (clause.term != null) {
                     onlyRangeBasedExtractions = false;
                     break;
@@ -639,28 +657,28 @@ final class QueryAnalyzer {
             }
 
             if (onlyRangeBasedExtractions) {
-                BytesRef extraction1SmallestRange = smallestRange(extractions1);
-                BytesRef extraction2SmallestRange = smallestRange(extractions2);
+                BytesRef extraction1SmallestRange = smallestRange(result1.extractions);
+                BytesRef extraction2SmallestRange = smallestRange(result2.extractions);
                 if (extraction1SmallestRange == null) {
-                    return extractions2;
+                    return result2.unverify();
                 } else if (extraction2SmallestRange == null) {
-                    return extractions1;
+                    return result1.unverify();
                 }
 
                 // Keep the clause with smallest range, this is likely to be the rarest.
                 if (extraction1SmallestRange.compareTo(extraction2SmallestRange) <= 0) {
-                    return extractions1;
+                    return result1.unverify();
                 } else {
-                    return extractions2;
+                    return result2.unverify();
                 }
             } else {
-                int extraction1ShortestTerm = minTermLength(extractions1);
-                int extraction2ShortestTerm = minTermLength(extractions2);
+                int extraction1ShortestTerm = minTermLength(result1.extractions);
+                int extraction2ShortestTerm = minTermLength(result2.extractions);
                 // keep the clause with longest terms, this likely to be rarest.
                 if (extraction1ShortestTerm >= extraction2ShortestTerm) {
-                    return extractions1;
+                    return result1.unverify();
                 } else {
-                    return extractions2;
+                    return result2.unverify();
                 }
             }
         }
@@ -695,6 +713,13 @@ final class QueryAnalyzer {
         return min;
     }
 
+    /**
+     * Query extraction result. A result is a candidate for a given document either if:
+     *  - `matchAllDocs` is true
+     *  - `extractions` and the document have `minimumShouldMatch` terms in common
+     *  Further more, the match doesn't need to be verified if `verified` is true, checking
+     *  `matchAllDocs` and `extractions` is enough.
+     */
     static class Result {
 
         final Set<QueryExtraction> extractions;
@@ -702,24 +727,32 @@ final class QueryAnalyzer {
         final int minimumShouldMatch;
         final boolean matchAllDocs;
 
-        Result(boolean verified, Set<QueryExtraction> extractions, int minimumShouldMatch) {
+        private Result(boolean matchAllDocs, boolean verified, Set<QueryExtraction> extractions, int minimumShouldMatch) {
             if (minimumShouldMatch > extractions.size()) {
                 throw new IllegalArgumentException("minimumShouldMatch can't be greater than the number of extractions: "
                     + minimumShouldMatch + " > " + extractions.size());
             }
+            this.matchAllDocs = matchAllDocs;
             this.extractions = extractions;
             this.verified = verified;
             this.minimumShouldMatch = minimumShouldMatch;
-            this.matchAllDocs = false;
+        }
+
+        Result(boolean verified, Set<QueryExtraction> extractions, int minimumShouldMatch) {
+            this(false, verified, extractions, minimumShouldMatch);
         }
 
         Result(boolean matchAllDocs, boolean verified) {
-            this.extractions = Collections.emptySet();
-            this.verified = verified;
-            this.minimumShouldMatch = 0;
-            this.matchAllDocs = matchAllDocs;
+            this(matchAllDocs, verified, Collections.emptySet(), 0);
        }
 
+        Result unverify() {
+            if (verified) {
+                return new Result(matchAllDocs, false, extractions, minimumShouldMatch);
+            } else {
+                return this;
+            }
+        }
     }
 
     static class QueryExtraction {
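For readers skimming the new selectBestResult helper above, here is a hedged, illustrative sketch of the conjunction-with-match_all contract (not part of the commit). It mirrors the "Conjunction with a match_all" assertions in the tests below and, like the earlier sketch, assumes placement in the org.elasticsearch.percolator package so the package-private members are visible:

    package org.elasticsearch.percolator;

    import java.util.Collections;

    import org.apache.lucene.index.Term;
    import org.elasticsearch.percolator.QueryAnalyzer.QueryExtraction;
    import org.elasticsearch.percolator.QueryAnalyzer.Result;

    // Illustrative only: not part of the commit.
    public class SelectBestResultSketch {
        public static void main(String[] args) {
            Result termClause = new Result(true,
                Collections.singleton(new QueryExtraction(new Term("field", "value"))), 1);

            // Conjunction with a verified match_all: the other clause is returned as-is.
            Result verifiedMatchAll = new Result(true, true);
            Result conj = QueryAnalyzer.selectBestResult(verifiedMatchAll, termClause);
            System.out.println(conj == termClause && conj.verified); // true

            // Conjunction with an unverified match_all (e.g. one sitting next to a must_not
            // clause): the extractions survive, but the result is unverified, so the
            // percolator re-checks candidate documents against the real query.
            Result unverifiedMatchAll = new Result(true, false);
            conj = QueryAnalyzer.selectBestResult(unverifiedMatchAll, termClause);
            System.out.println(conj.verified); // false
        }
    }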
QueryAnalyzerTests.java

@@ -74,7 +74,7 @@ import java.util.stream.Collectors;
 import static org.elasticsearch.percolator.QueryAnalyzer.UnsupportedQueryException;
 import static org.elasticsearch.percolator.QueryAnalyzer.analyze;
-import static org.elasticsearch.percolator.QueryAnalyzer.selectBestExtraction;
+import static org.elasticsearch.percolator.QueryAnalyzer.selectBestResult;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.is;
 import static org.hamcrest.Matchers.sameInstance;
@@ -163,6 +163,20 @@ public class QueryAnalyzerTests extends ESTestCase {
         assertThat(terms.get(0).bytes().utf8ToString(), equalTo("_very_long_term"));
     }
 
+    public void testExtractQueryMetadata_multiPhraseQuery_dups() {
+        MultiPhraseQuery multiPhraseQuery = new MultiPhraseQuery.Builder()
+            .add(new Term("_field", "_term1"))
+            .add(new Term[] {new Term("_field", "_term1"), new Term("_field", "_term2")})
+            .build();
+
+        Result result = analyze(multiPhraseQuery, Version.CURRENT);
+        assertFalse(result.matchAllDocs);
+        assertFalse(result.verified);
+        assertTermsEqual(result.extractions, new Term("_field", "_term1"), new Term("_field", "_term2"));
+        assertEquals(1, result.minimumShouldMatch); // because of the dup term
+    }
+
     public void testExtractQueryMetadata_booleanQuery() {
         BooleanQuery.Builder builder = new BooleanQuery.Builder();
         TermQuery termQuery1 = new TermQuery(new Term("_field", "term0"));
@@ -370,18 +384,28 @@ public class QueryAnalyzerTests extends ESTestCase {
         builder.add(termQuery1, BooleanClause.Occur.MUST_NOT);
         PhraseQuery phraseQuery = new PhraseQuery("_field", "_term1", "term2");
         builder.add(phraseQuery, BooleanClause.Occur.SHOULD);
 
         BooleanQuery booleanQuery = builder.build();
 
         Result result = analyze(booleanQuery, Version.CURRENT);
         assertThat(result.verified, is(false));
         assertThat(result.minimumShouldMatch, equalTo(2));
-        List<QueryExtraction> terms = new ArrayList<>(result.extractions);
-        assertThat(terms.size(), equalTo(2));
-        terms.sort(Comparator.comparing(qt -> qt.term));
-        assertThat(terms.get(0).field(), equalTo(phraseQuery.getTerms()[0].field()));
-        assertThat(terms.get(0).bytes(), equalTo(phraseQuery.getTerms()[0].bytes()));
-        assertThat(terms.get(1).field(), equalTo(phraseQuery.getTerms()[1].field()));
-        assertThat(terms.get(1).bytes(), equalTo(phraseQuery.getTerms()[1].bytes()));
+        assertTermsEqual(result.extractions, phraseQuery.getTerms());
+
+        builder = new BooleanQuery.Builder();
+        builder.add(termQuery1, BooleanClause.Occur.MUST_NOT);
+        builder.add(new MatchAllDocsQuery(), BooleanClause.Occur.MUST);
+        booleanQuery = builder.build();
+        result = analyze(booleanQuery, Version.CURRENT);
+        assertThat(result.matchAllDocs, is(true));
+        assertThat(result.verified, is(false));
+        assertThat(result.minimumShouldMatch, equalTo(0));
+        assertTermsEqual(result.extractions);
+
+        result = analyze(booleanQuery, Version.V_6_0_0);
+        assertThat(result.matchAllDocs, is(true));
+        assertThat(result.verified, is(false));
+        assertThat(result.minimumShouldMatch, equalTo(0));
+        assertTermsEqual(result.extractions);
     }
 
     public void testExactMatch_booleanQuery() {
@@ -651,7 +675,7 @@ public class QueryAnalyzerTests extends ESTestCase {
         SpanTermQuery spanTermQuery2 = new SpanTermQuery(new Term("_field", "_very_long_term"));
         SpanOrQuery spanOrQuery = new SpanOrQuery(spanTermQuery1, spanTermQuery2);
         Result result = analyze(spanOrQuery, Version.CURRENT);
-        assertThat(result.verified, is(false));
+        assertThat(result.verified, is(true));
         assertThat(result.minimumShouldMatch, equalTo(1));
         assertTermsEqual(result.extractions, spanTermQuery1.getTerm(), spanTermQuery2.getTerm());
     }
@@ -943,64 +967,111 @@ public class QueryAnalyzerTests extends ESTestCase {
         assertThat(result.extractions.isEmpty(), is(true));
     }
 
-    public void testSelectBestExtraction() {
+    public void testSelectBestResult() {
         Set<QueryExtraction> queryTerms1 = terms(new int[0], "12", "1234", "12345");
+        Result result1 = new Result(true, queryTerms1, 1);
         Set<QueryAnalyzer.QueryExtraction> queryTerms2 = terms(new int[0], "123", "1234", "12345");
-        Set<QueryExtraction> result = selectBestExtraction(queryTerms1, queryTerms2);
-        assertSame(queryTerms2, result);
+        Result result2 = new Result(true, queryTerms2, 1);
+        Result result = selectBestResult(result1, result2);
+        assertSame(queryTerms2, result.extractions);
+        assertFalse(result.verified);
 
         queryTerms1 = terms(new int[]{1, 2, 3});
+        result1 = new Result(true, queryTerms1, 1);
         queryTerms2 = terms(new int[]{2, 3, 4});
-        result = selectBestExtraction(queryTerms1, queryTerms2);
-        assertSame(queryTerms1, result);
+        result2 = new Result(true, queryTerms2, 1);
+        result = selectBestResult(result1, result2);
+        assertSame(queryTerms1, result.extractions);
+        assertFalse(result.verified);
 
         queryTerms1 = terms(new int[]{4, 5, 6});
+        result1 = new Result(true, queryTerms1, 1);
         queryTerms2 = terms(new int[]{1, 2, 3});
-        result = selectBestExtraction(queryTerms1, queryTerms2);
-        assertSame(queryTerms2, result);
+        result2 = new Result(true, queryTerms2, 1);
+        result = selectBestResult(result1, result2);
+        assertSame(queryTerms2, result.extractions);
+        assertFalse(result.verified);
 
         queryTerms1 = terms(new int[]{1, 2, 3}, "123", "456");
+        result1 = new Result(true, queryTerms1, 1);
         queryTerms2 = terms(new int[]{2, 3, 4}, "123", "456");
-        result = selectBestExtraction(queryTerms1, queryTerms2);
-        assertSame(queryTerms1, result);
+        result2 = new Result(true, queryTerms2, 1);
+        result = selectBestResult(result1, result2);
+        assertSame(queryTerms1, result.extractions);
+        assertFalse(result.verified);
 
         queryTerms1 = terms(new int[]{10});
+        result1 = new Result(true, queryTerms1, 1);
         queryTerms2 = terms(new int[]{1});
-        result = selectBestExtraction(queryTerms1, queryTerms2);
-        assertSame(queryTerms2, result);
+        result2 = new Result(true, queryTerms2, 1);
+        result = selectBestResult(result1, result2);
+        assertSame(queryTerms2, result.extractions);
 
         queryTerms1 = terms(new int[]{10}, "123");
+        result1 = new Result(true, queryTerms1, 1);
         queryTerms2 = terms(new int[]{1});
-        result = selectBestExtraction(queryTerms1, queryTerms2);
-        assertSame(queryTerms1, result);
+        result2 = new Result(true, queryTerms2, 1);
+        result = selectBestResult(result1, result2);
+        assertSame(queryTerms1, result.extractions);
+        assertFalse(result.verified);
 
         queryTerms1 = terms(new int[]{10}, "1", "123");
+        result1 = new Result(true, queryTerms1, 1);
         queryTerms2 = terms(new int[]{1}, "1", "2");
-        result = selectBestExtraction(queryTerms1, queryTerms2);
-        assertSame(queryTerms1, result);
+        result2 = new Result(true, queryTerms2, 1);
+        result = selectBestResult(result1, result2);
+        assertSame(queryTerms1, result.extractions);
+        assertFalse(result.verified);
 
         queryTerms1 = terms(new int[]{1, 2, 3}, "123", "456");
+        result1 = new Result(true, queryTerms1, 1);
         queryTerms2 = terms(new int[]{2, 3, 4}, "1", "456");
-        result = selectBestExtraction(queryTerms1, queryTerms2);
-        assertSame("Ignoring ranges, so then prefer queryTerms1, because it has the longest shortest term", queryTerms1, result);
+        result2 = new Result(true, queryTerms2, 1);
+        result = selectBestResult(result1, result2);
+        assertSame("Ignoring ranges, so then prefer queryTerms1, because it has the longest shortest term",
+            queryTerms1, result.extractions);
+        assertFalse(result.verified);
 
         queryTerms1 = terms(new int[]{});
+        result1 = new Result(false, queryTerms1, 0);
         queryTerms2 = terms(new int[]{});
-        result = selectBestExtraction(queryTerms1, queryTerms2);
-        assertSame("In case query extractions are empty", queryTerms2, result);
+        result2 = new Result(false, queryTerms2, 0);
+        result = selectBestResult(result1, result2);
+        assertSame("In case query extractions are empty", queryTerms2, result.extractions);
+        assertFalse(result.verified);
 
         queryTerms1 = terms(new int[]{1});
+        result1 = new Result(true, queryTerms1, 1);
         queryTerms2 = terms(new int[]{});
-        result = selectBestExtraction(queryTerms1, queryTerms2);
-        assertSame("In case query a single extraction is empty", queryTerms1, result);
+        result2 = new Result(false, queryTerms2, 0);
+        result = selectBestResult(result1, result2);
+        assertSame("In case query a single extraction is empty", queryTerms1, result.extractions);
+        assertFalse(result.verified);
 
         queryTerms1 = terms(new int[]{});
+        result1 = new Result(false, queryTerms1, 0);
         queryTerms2 = terms(new int[]{1});
-        result = selectBestExtraction(queryTerms1, queryTerms2);
-        assertSame("In case query a single extraction is empty", queryTerms2, result);
+        result2 = new Result(true, queryTerms2, 1);
+        result = selectBestResult(result1, result2);
+        assertSame("In case query a single extraction is empty", queryTerms2, result.extractions);
+        assertFalse(result.verified);
+
+        result1 = new Result(true, true);
+        queryTerms2 = terms(new int[]{1});
+        result2 = new Result(true, queryTerms2, 1);
+        result = selectBestResult(result1, result2);
+        assertSame("Conjunction with a match_all", result2, result);
+        assertTrue(result.verified);
+
+        queryTerms1 = terms(new int[]{1});
+        result1 = new Result(true, queryTerms2, 1);
+        result2 = new Result(true, true);
+        result = selectBestResult(result1, result2);
+        assertSame("Conjunction with a match_all", result1, result);
+        assertTrue(result.verified);
     }
 
-    public void testSelectBestExtraction_random() {
+    public void testselectBestResult_random() {
         Set<QueryExtraction> terms1 = new HashSet<>();
         int shortestTerms1Length = Integer.MAX_VALUE;
         int sumTermLength = randomIntBetween(1, 128);
@@ -1021,9 +1092,11 @@ public class QueryAnalyzerTests extends ESTestCase {
             sumTermLength -= length;
         }
 
-        Set<QueryAnalyzer.QueryExtraction> result = selectBestExtraction(terms1, terms2);
+        Result result1 = new Result(true, terms1, 1);
+        Result result2 = new Result(true, terms2, 1);
+        Result result = selectBestResult(result1, result2);
         Set<QueryExtraction> expected = shortestTerms1Length >= shortestTerms2Length ? terms1 : terms2;
-        assertThat(result, sameInstance(expected));
+        assertThat(result.extractions, sameInstance(expected));
     }
 
     public void testPointRangeQuery() {
QueryBuilderBWCIT.java

@@ -100,13 +100,12 @@ public class QueryBuilderBWCIT extends ESRestTestCase {
             new MatchPhraseQueryBuilder("keyword_field", "value").slop(3)
         );
         addCandidate("\"range\": { \"long_field\": {\"gte\": 1, \"lte\": 9}}", new RangeQueryBuilder("long_field").from(1).to(9));
-        // bug url https://github.com/elastic/elasticsearch/issues/29376
-        /*addCandidate(
+        addCandidate(
             "\"bool\": { \"must_not\": [{\"match_all\": {}}], \"must\": [{\"match_all\": {}}], " +
                 "\"filter\": [{\"match_all\": {}}], \"should\": [{\"match_all\": {}}]}",
             new BoolQueryBuilder().mustNot(new MatchAllQueryBuilder()).must(new MatchAllQueryBuilder())
                 .filter(new MatchAllQueryBuilder()).should(new MatchAllQueryBuilder())
-        );*/
+        );
         addCandidate(
             "\"dis_max\": {\"queries\": [{\"match_all\": {}},{\"match_all\": {}},{\"match_all\": {}}], \"tie_breaker\": 0.01}",
             new DisMaxQueryBuilder().add(new MatchAllQueryBuilder()).add(new MatchAllQueryBuilder()).add(new MatchAllQueryBuilder())