Add a limit for graph phrase query expansion (#34031)

Today query parsers throw TooManyClauses exception when a query creates
too many clauses. However graph phrase queries do not respect this limit.
This change adds protection against pathological expansions that can happen when
building a graph phrase query. This is a temporary copy of the fix available
in https://issues.apache.org/jira/browse/LUCENE-8479 but not merged yet.
This logic will be removed when we integrate the Lucene patch in a future
release.
This commit is contained in:
Jim Ferenczi 2018-09-25 21:38:47 +02:00 committed by GitHub
parent 1e6780d703
commit 0f878eff19
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 157 additions and 1 deletions

View File

@@ -44,6 +44,7 @@ import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.QueryBuilder;
import org.apache.lucene.util.graph.GraphTokenStreamFiniteStrings;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
@@ -57,11 +58,14 @@ import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.index.query.support.QueryParsers;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import static org.elasticsearch.common.lucene.search.Queries.newLenientFieldQuery;
import static org.elasticsearch.common.lucene.search.Queries.newUnmappedFieldQuery;
public class MatchQuery {
public enum Type implements Writeable {
/**
@@ -503,6 +507,82 @@ public class MatchQuery {
}
return query;
}
/**
 * Overrides {@link QueryBuilder#analyzeGraphPhrase(TokenStream, String, int)} to add
 * a limit (see {@link BooleanQuery#getMaxClauseCount()}) to the number of {@link SpanQuery}
 * that this method can create.
 *
 * TODO Remove when https://issues.apache.org/jira/browse/LUCENE-8479 is fixed.
 */
@Override
protected SpanQuery analyzeGraphPhrase(TokenStream source, String field, int phraseSlop) throws IOException {
source.reset();
// Turn the token stream into a finite-state token graph so each segment
// between articulation points can be expanded independently.
GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(source);
List<SpanQuery> clauses = new ArrayList<>();
int[] articulationPoints = graph.articulationPoints();
int lastState = 0;
// Hard cap on expansion; every growth point below is checked against it.
int maxBooleanClause = BooleanQuery.getMaxClauseCount();
// One extra iteration (<=) so the tail segment after the last articulation
// point is also processed, with end == -1 meaning "to the end of the graph".
for (int i = 0; i <= articulationPoints.length; i++) {
int start = lastState;
int end = -1;
if (i < articulationPoints.length) {
end = articulationPoints[i];
}
lastState = end;
// Span query built for the current [start, end) segment; null if the
// segment yields nothing.
final SpanQuery queryPos;
if (graph.hasSidePath(start)) {
// Segment contains alternative paths: enumerate every finite string
// through it and OR the resulting span queries together.
List<SpanQuery> queries = new ArrayList<>();
Iterator<TokenStream> it = graph.getFiniteStrings(start, end);
while (it.hasNext()) {
TokenStream ts = it.next();
SpanQuery q = createSpanQuery(ts, field);
if (q != null) {
// Reject before adding the clause that would exceed the limit.
if (queries.size() >= maxBooleanClause) {
throw new BooleanQuery.TooManyClauses();
}
queries.add(q);
}
}
if (queries.size() > 0) {
queryPos = new SpanOrQuery(queries.toArray(new SpanQuery[0]));
} else {
queryPos = null;
}
} else {
// Single path: all terms at this position become one term query or an
// OR over the stacked terms (e.g. synonyms).
Term[] terms = graph.getTerms(field, start);
assert terms.length > 0;
// NOTE(review): this rejects terms.length == maxBooleanClause, while the
// side-path branch above allows exactly maxBooleanClause clauses — an
// off-by-one asymmetry inherited from the upstream patch; confirm intent.
if (terms.length >= maxBooleanClause) {
throw new BooleanQuery.TooManyClauses();
}
if (terms.length == 1) {
queryPos = new SpanTermQuery(terms[0]);
} else {
SpanTermQuery[] orClauses = new SpanTermQuery[terms.length];
for (int idx = 0; idx < terms.length; idx++) {
orClauses[idx] = new SpanTermQuery(terms[idx]);
}
queryPos = new SpanOrQuery(orClauses);
}
}
if (queryPos != null) {
// The total number of positional clauses is capped as well.
if (clauses.size() >= maxBooleanClause) {
throw new BooleanQuery.TooManyClauses();
}
clauses.add(queryPos);
}
}
if (clauses.isEmpty()) {
return null;
} else if (clauses.size() == 1) {
return clauses.get(0);
} else {
// Ordered near query: segments must appear in sequence, within phraseSlop.
return new SpanNearQuery(clauses.toArray(new SpanQuery[0]), phraseSlop, true);
}
}
}
/**

View File

@@ -19,6 +19,11 @@
package org.elasticsearch.index.query;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CannedBinaryTokenStream;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.ExtendedCommonTermsQuery;
import org.apache.lucene.search.BooleanClause;
@@ -30,6 +35,7 @@ import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.PointRangeQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.action.admin.indices.mapping.put.PutMappingRequest;
import org.elasticsearch.common.ParsingException;
import org.elasticsearch.common.Strings;
@@ -46,6 +52,8 @@ import org.elasticsearch.test.AbstractQueryTestCase;
import org.hamcrest.Matcher;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
@@ -392,4 +400,72 @@ public class MatchQueryBuilderTests extends AbstractQueryTestCase&lt;MatchQueryBuil
assertThat(query.toString(),
containsString("field:[string_no_pos] was indexed without position data; cannot run PhraseQuery"));
}
public void testMaxBooleanClause() {
    MatchQuery query = new MatchQuery(createShardContext());
    // Both graph shapes must trip the clause limit when parsed as a phrase:
    // one with exponential side-path expansion, one with a huge term fan-out.
    CannedBinaryTokenStream.BinaryToken[][] giantGraphs = {
        createGiantGraph(40),
        createGiantGraphMultiTerms()
    };
    for (CannedBinaryTokenStream.BinaryToken[] graphTokens : giantGraphs) {
        query.setAnalyzer(new MockGraphAnalyzer(graphTokens));
        expectThrows(BooleanQuery.TooManyClauses.class, () -> query.parse(Type.PHRASE, STRING_FIELD_NAME, ""));
    }
}
/**
 * Analyzer whose token stream is replaced by a fixed, pre-built token graph,
 * regardless of the analyzed input text.
 */
private static class MockGraphAnalyzer extends Analyzer {
    final CannedBinaryTokenStream.BinaryToken[] tokens;

    private MockGraphAnalyzer(CannedBinaryTokenStream.BinaryToken[] cannedTokens) {
        this.tokens = cannedTokens;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer base = new MockTokenizer(MockTokenizer.SIMPLE, true);
        return new TokenStreamComponents(base) {
            @Override
            public TokenStream getTokenStream() {
                // Discard whatever the tokenizer produced and replay the canned graph.
                return new CannedBinaryTokenStream(tokens);
            }

            @Override
            protected void setReader(final Reader reader) {
                // No-op: the canned stream never consumes the reader.
            }
        };
    }
}
/**
* Creates a graph token stream with 2 side paths at each position.
**/
/**
 * Creates a graph token stream with 2 side paths at each position.
 **/
private static CannedBinaryTokenStream.BinaryToken[] createGiantGraph(int numPos) {
    List<CannedBinaryTokenStream.BinaryToken> out = new ArrayList<>();
    BytesRef sidePath = new BytesRef("foo");
    BytesRef mainPath = new BytesRef("bar");
    int pos = 0;
    while (pos < numPos) {
        if (pos % 2 == 0) {
            // Even positions: the main-path token plus a length-2 side path
            // (posInc 0, posLength 2) skipping over the next position.
            out.add(new CannedBinaryTokenStream.BinaryToken(mainPath, 1, 1));
            out.add(new CannedBinaryTokenStream.BinaryToken(sidePath, 0, 2));
            pos += 2;
        } else {
            // Odd positions: a single main-path token.
            out.add(new CannedBinaryTokenStream.BinaryToken(mainPath, 1, 1));
            pos += 1;
        }
    }
    return out.toArray(new CannedBinaryTokenStream.BinaryToken[0]);
}
/**
* Creates a graph token stream with {@link BooleanQuery#getMaxClauseCount()}
* expansions at the last position.
**/
/**
 * Creates a graph token stream with {@link BooleanQuery#getMaxClauseCount()}
 * expansions at the last position.
 **/
private static CannedBinaryTokenStream.BinaryToken[] createGiantGraphMultiTerms() {
    BytesRef foo = new BytesRef("foo");
    BytesRef bar = new BytesRef("bar");
    List<CannedBinaryTokenStream.BinaryToken> tokens = new ArrayList<>();
    // A small prefix graph: one side path over the first two positions...
    tokens.add(new CannedBinaryTokenStream.BinaryToken(bar, 1, 1));
    tokens.add(new CannedBinaryTokenStream.BinaryToken(foo, 0, 2));
    tokens.add(new CannedBinaryTokenStream.BinaryToken(bar, 1, 1));
    tokens.add(new CannedBinaryTokenStream.BinaryToken(bar, 1, 1));
    // ...then a stack of maxClauseCount synonyms at the final position.
    int remaining = BooleanQuery.getMaxClauseCount();
    while (remaining-- > 0) {
        tokens.add(new CannedBinaryTokenStream.BinaryToken(foo, 0, 1));
    }
    return tokens.toArray(new CannedBinaryTokenStream.BinaryToken[0]);
}
}