Add a limit for graph phrase query expansion (#34031)

Today query parsers throw TooManyClauses exception when a query creates
too many clauses. However graph phrase queries do not respect this limit.
This change adds protection against pathological expansions that can happen when
building a graph phrase query. This is a temporary copy of the fix available
in https://issues.apache.org/jira/browse/LUCENE-8479 but not merged yet.
This logic will be removed when we integrate the Lucene patch in a future
release.
This commit is contained in:
Jim Ferenczi 2018-09-25 21:38:47 +02:00 committed by GitHub
parent 1e6780d703
commit 0f878eff19
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 157 additions and 1 deletions

View File

@@ -44,6 +44,7 @@ import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.QueryBuilder;
import org.apache.lucene.util.graph.GraphTokenStreamFiniteStrings;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
@@ -57,11 +58,14 @@ import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.index.query.support.QueryParsers;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import static org.elasticsearch.common.lucene.search.Queries.newLenientFieldQuery;
import static org.elasticsearch.common.lucene.search.Queries.newUnmappedFieldQuery;
public class MatchQuery {
public enum Type implements Writeable {
/**
@@ -503,6 +507,82 @@ public class MatchQuery {
}
return query;
}
/**
 * Overrides {@link QueryBuilder#analyzeGraphPhrase(TokenStream, String, int)} to add
 * a limit (see {@link BooleanQuery#getMaxClauseCount()}) to the number of {@link SpanQuery}
 * that this method can create.
 *
 * TODO Remove when https://issues.apache.org/jira/browse/LUCENE-8479 is fixed.
 */
@Override
protected SpanQuery analyzeGraphPhrase(TokenStream source, String field, int phraseSlop) throws IOException {
source.reset();
// Turn the token stream into a finite-state token graph so each segment
// between articulation points can be expanded independently.
GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(source);
List<SpanQuery> clauses = new ArrayList<>();
int[] articulationPoints = graph.articulationPoints();
int lastState = 0;
// Hard cap on expansion; every growth point below is checked against it.
int maxBooleanClause = BooleanQuery.getMaxClauseCount();
// One extra iteration (<=) so the tail segment after the last articulation
// point is also processed, with end == -1 meaning "to the end of the graph".
for (int i = 0; i <= articulationPoints.length; i++) {
int start = lastState;
int end = -1;
if (i < articulationPoints.length) {
end = articulationPoints[i];
}
lastState = end;
// Span query built for the current [start, end) segment; null if the
// segment yields nothing.
final SpanQuery queryPos;
if (graph.hasSidePath(start)) {
// Segment contains alternative paths: enumerate every finite string
// through it and OR the resulting span queries together.
List<SpanQuery> queries = new ArrayList<>();
Iterator<TokenStream> it = graph.getFiniteStrings(start, end);
while (it.hasNext()) {
TokenStream ts = it.next();
SpanQuery q = createSpanQuery(ts, field);
if (q != null) {
// Reject before adding the clause that would exceed the limit.
if (queries.size() >= maxBooleanClause) {
throw new BooleanQuery.TooManyClauses();
}
queries.add(q);
}
}
if (queries.size() > 0) {
queryPos = new SpanOrQuery(queries.toArray(new SpanQuery[0]));
} else {
queryPos = null;
}
} else {
// Single path: all terms at this position become one term query or an
// OR over the stacked terms (e.g. synonyms).
Term[] terms = graph.getTerms(field, start);
assert terms.length > 0;
// NOTE(review): this rejects terms.length == maxBooleanClause, while the
// side-path branch above allows exactly maxBooleanClause clauses — an
// off-by-one asymmetry inherited from the upstream patch; confirm intent.
if (terms.length >= maxBooleanClause) {
throw new BooleanQuery.TooManyClauses();
}
if (terms.length == 1) {
queryPos = new SpanTermQuery(terms[0]);
} else {
SpanTermQuery[] orClauses = new SpanTermQuery[terms.length];
for (int idx = 0; idx < terms.length; idx++) {
orClauses[idx] = new SpanTermQuery(terms[idx]);
}
queryPos = new SpanOrQuery(orClauses);
}
}
if (queryPos != null) {
// The total number of positional clauses is capped as well.
if (clauses.size() >= maxBooleanClause) {
throw new BooleanQuery.TooManyClauses();
}
clauses.add(queryPos);
}
}
if (clauses.isEmpty()) {
return null;
} else if (clauses.size() == 1) {
return clauses.get(0);
} else {
// Ordered near query: segments must appear in sequence, within phraseSlop.
return new SpanNearQuery(clauses.toArray(new SpanQuery[0]), phraseSlop, true);
}
}
}
/**

View File

@@ -19,6 +19,11 @@
package org.elasticsearch.index.query;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CannedBinaryTokenStream;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.ExtendedCommonTermsQuery;
import org.apache.lucene.search.BooleanClause;
@@ -30,6 +35,7 @@ import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.PointRangeQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.action.admin.indices.mapping.put.PutMappingRequest;
import org.elasticsearch.common.ParsingException;
import org.elasticsearch.common.Strings;
@@ -46,6 +52,8 @@ import org.elasticsearch.test.AbstractQueryTestCase;
import org.hamcrest.Matcher;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
@@ -392,4 +400,72 @@ public class MatchQueryBuilderTests extends AbstractQueryTestCase&lt;MatchQueryBuil
assertThat(query.toString(),
containsString("field:[string_no_pos] was indexed without position data; cannot run PhraseQuery"));
}
public void testMaxBooleanClause() {
    MatchQuery query = new MatchQuery(createShardContext());
    // Both graph shapes must trip the clause limit when parsed as a phrase:
    // one with exponential side-path expansion, one with a huge term fan-out.
    CannedBinaryTokenStream.BinaryToken[][] giantGraphs = {
        createGiantGraph(40),
        createGiantGraphMultiTerms()
    };
    for (CannedBinaryTokenStream.BinaryToken[] graphTokens : giantGraphs) {
        query.setAnalyzer(new MockGraphAnalyzer(graphTokens));
        expectThrows(BooleanQuery.TooManyClauses.class, () -> query.parse(Type.PHRASE, STRING_FIELD_NAME, ""));
    }
}
/**
 * Analyzer whose token stream is replaced by a fixed, pre-built token graph,
 * regardless of the analyzed input text.
 */
private static class MockGraphAnalyzer extends Analyzer {
    final CannedBinaryTokenStream.BinaryToken[] tokens;

    private MockGraphAnalyzer(CannedBinaryTokenStream.BinaryToken[] cannedTokens) {
        this.tokens = cannedTokens;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer base = new MockTokenizer(MockTokenizer.SIMPLE, true);
        return new TokenStreamComponents(base) {
            @Override
            public TokenStream getTokenStream() {
                // Discard whatever the tokenizer produced and replay the canned graph.
                return new CannedBinaryTokenStream(tokens);
            }

            @Override
            protected void setReader(final Reader reader) {
                // No-op: the canned stream never consumes the reader.
            }
        };
    }
}
/**
* Creates a graph token stream with 2 side paths at each position.
**/
/**
 * Creates a graph token stream with 2 side paths at each position.
 **/
private static CannedBinaryTokenStream.BinaryToken[] createGiantGraph(int numPos) {
    List<CannedBinaryTokenStream.BinaryToken> out = new ArrayList<>();
    BytesRef sidePath = new BytesRef("foo");
    BytesRef mainPath = new BytesRef("bar");
    int pos = 0;
    while (pos < numPos) {
        if (pos % 2 == 0) {
            // Even positions: the main-path token plus a length-2 side path
            // (posInc 0, posLength 2) skipping over the next position.
            out.add(new CannedBinaryTokenStream.BinaryToken(mainPath, 1, 1));
            out.add(new CannedBinaryTokenStream.BinaryToken(sidePath, 0, 2));
            pos += 2;
        } else {
            // Odd positions: a single main-path token.
            out.add(new CannedBinaryTokenStream.BinaryToken(mainPath, 1, 1));
            pos += 1;
        }
    }
    return out.toArray(new CannedBinaryTokenStream.BinaryToken[0]);
}
/**
* Creates a graph token stream with {@link BooleanQuery#getMaxClauseCount()}
* expansions at the last position.
**/
/**
 * Creates a graph token stream with {@link BooleanQuery#getMaxClauseCount()}
 * expansions at the last position.
 **/
private static CannedBinaryTokenStream.BinaryToken[] createGiantGraphMultiTerms() {
    BytesRef foo = new BytesRef("foo");
    BytesRef bar = new BytesRef("bar");
    List<CannedBinaryTokenStream.BinaryToken> tokens = new ArrayList<>();
    // A small prefix graph: one side path over the first two positions...
    tokens.add(new CannedBinaryTokenStream.BinaryToken(bar, 1, 1));
    tokens.add(new CannedBinaryTokenStream.BinaryToken(foo, 0, 2));
    tokens.add(new CannedBinaryTokenStream.BinaryToken(bar, 1, 1));
    tokens.add(new CannedBinaryTokenStream.BinaryToken(bar, 1, 1));
    // ...then a stack of maxClauseCount synonyms at the final position.
    int remaining = BooleanQuery.getMaxClauseCount();
    while (remaining-- > 0) {
        tokens.add(new CannedBinaryTokenStream.BinaryToken(foo, 0, 1));
    }
    return tokens.toArray(new CannedBinaryTokenStream.BinaryToken[0]);
}
}