Add a limit for graph phrase query expansion (#34031)
Today, query parsers throw a TooManyClauses exception when a query creates too many clauses. However, graph phrase queries do not respect this limit. This change adds protection against the pathological expansions that can happen when building a graph phrase query. This is a temporary copy of the fix available in https://issues.apache.org/jira/browse/LUCENE-8479 but not yet merged. This logic will be removed when we integrate the Lucene patch in a future release.
This commit is contained in:
parent
1e6780d703
commit
0f878eff19
|
@ -44,6 +44,7 @@ import org.apache.lucene.search.spans.SpanQuery;
|
|||
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.QueryBuilder;
|
||||
import org.apache.lucene.util.graph.GraphTokenStreamFiniteStrings;
|
||||
import org.elasticsearch.ElasticsearchException;
|
||||
import org.elasticsearch.common.io.stream.StreamInput;
|
||||
import org.elasticsearch.common.io.stream.StreamOutput;
|
||||
|
@ -57,11 +58,14 @@ import org.elasticsearch.index.query.QueryShardContext;
|
|||
import org.elasticsearch.index.query.support.QueryParsers;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import static org.elasticsearch.common.lucene.search.Queries.newLenientFieldQuery;
|
||||
import static org.elasticsearch.common.lucene.search.Queries.newUnmappedFieldQuery;
|
||||
|
||||
public class MatchQuery {
|
||||
public class MatchQuery {
|
||||
|
||||
public enum Type implements Writeable {
|
||||
/**
|
||||
|
@ -503,6 +507,82 @@ public class MatchQuery {
|
|||
}
|
||||
return query;
|
||||
}
|
||||
|
||||
    /**
     * Overrides {@link QueryBuilder#analyzeGraphPhrase(TokenStream, String, int)} to add
     * a limit (see {@link BooleanQuery#getMaxClauseCount()}) to the number of {@link SpanQuery}
     * that this method can create.
     *
     * TODO Remove when https://issues.apache.org/jira/browse/LUCENE-8479 is fixed.
     */
    @Override
    protected SpanQuery analyzeGraphPhrase(TokenStream source, String field, int phraseSlop) throws IOException {
        source.reset();
        GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(source);
        List<SpanQuery> clauses = new ArrayList<>();
        int[] articulationPoints = graph.articulationPoints();
        int lastState = 0;
        // Hard cap on expansion: every place below that could grow past this limit
        // throws BooleanQuery.TooManyClauses instead of expanding unboundedly.
        int maxBooleanClause = BooleanQuery.getMaxClauseCount();
        // Walk the token graph segment by segment, where each segment spans two
        // consecutive articulation points. The extra iteration (i == length) covers
        // the tail of the graph, signalled by end == -1.
        for (int i = 0; i <= articulationPoints.length; i++) {
            int start = lastState;
            int end = -1;
            if (i < articulationPoints.length) {
                end = articulationPoints[i];
            }
            lastState = end;
            final SpanQuery queryPos;
            if (graph.hasSidePath(start)) {
                // Segment contains alternate paths: enumerate each finite path through it,
                // build a span query per path, and OR them together — bounded by the limit.
                List<SpanQuery> queries = new ArrayList<>();
                Iterator<TokenStream> it = graph.getFiniteStrings(start, end);
                while (it.hasNext()) {
                    TokenStream ts = it.next();
                    // createSpanQuery may return null (e.g. nothing analyzable on this path).
                    SpanQuery q = createSpanQuery(ts, field);
                    if (q != null) {
                        if (queries.size() >= maxBooleanClause) {
                            throw new BooleanQuery.TooManyClauses();
                        }
                        queries.add(q);
                    }
                }
                if (queries.size() > 0) {
                    queryPos = new SpanOrQuery(queries.toArray(new SpanQuery[0]));
                } else {
                    queryPos = null;
                }
            } else {
                // Linear segment: a single position, possibly with multiple stacked terms
                // (synonyms). Multiple terms become a SpanOrQuery over SpanTermQuery clauses.
                Term[] terms = graph.getTerms(field, start);
                assert terms.length > 0;
                if (terms.length >= maxBooleanClause) {
                    throw new BooleanQuery.TooManyClauses();
                }
                if (terms.length == 1) {
                    queryPos = new SpanTermQuery(terms[0]);
                } else {
                    SpanTermQuery[] orClauses = new SpanTermQuery[terms.length];
                    for (int idx = 0; idx < terms.length; idx++) {
                        orClauses[idx] = new SpanTermQuery(terms[idx]);
                    }

                    queryPos = new SpanOrQuery(orClauses);
                }
            }

            if (queryPos != null) {
                // The overall clause list is bounded by the same limit.
                if (clauses.size() >= maxBooleanClause) {
                    throw new BooleanQuery.TooManyClauses();
                }
                clauses.add(queryPos);
            }
        }

        if (clauses.isEmpty()) {
            return null;
        } else if (clauses.size() == 1) {
            return clauses.get(0);
        } else {
            // Chain the per-segment queries in document order, honoring the requested slop.
            return new SpanNearQuery(clauses.toArray(new SpanQuery[0]), phraseSlop, true);
        }
    }
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -19,6 +19,11 @@
|
|||
|
||||
package org.elasticsearch.index.query;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CannedBinaryTokenStream;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.queries.ExtendedCommonTermsQuery;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
|
@ -30,6 +35,7 @@ import org.apache.lucene.search.MatchNoDocsQuery;
|
|||
import org.apache.lucene.search.PointRangeQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.elasticsearch.action.admin.indices.mapping.put.PutMappingRequest;
|
||||
import org.elasticsearch.common.ParsingException;
|
||||
import org.elasticsearch.common.Strings;
|
||||
|
@ -46,6 +52,8 @@ import org.elasticsearch.test.AbstractQueryTestCase;
|
|||
import org.hamcrest.Matcher;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
|
@ -392,4 +400,72 @@ public class MatchQueryBuilderTests extends AbstractQueryTestCase<MatchQueryBuil
|
|||
assertThat(query.toString(),
|
||||
containsString("field:[string_no_pos] was indexed without position data; cannot run PhraseQuery"));
|
||||
}
|
||||
|
||||
public void testMaxBooleanClause() {
|
||||
MatchQuery query = new MatchQuery(createShardContext());
|
||||
query.setAnalyzer(new MockGraphAnalyzer(createGiantGraph(40)));
|
||||
expectThrows(BooleanQuery.TooManyClauses.class, () -> query.parse(Type.PHRASE, STRING_FIELD_NAME, ""));
|
||||
query.setAnalyzer(new MockGraphAnalyzer(createGiantGraphMultiTerms()));
|
||||
expectThrows(BooleanQuery.TooManyClauses.class, () -> query.parse(Type.PHRASE, STRING_FIELD_NAME, ""));
|
||||
}
|
||||
|
||||
private static class MockGraphAnalyzer extends Analyzer {
|
||||
final CannedBinaryTokenStream.BinaryToken[] tokens;
|
||||
|
||||
private MockGraphAnalyzer(CannedBinaryTokenStream.BinaryToken[] tokens ) {
|
||||
this.tokens = tokens;
|
||||
}
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
|
||||
return new TokenStreamComponents(tokenizer) {
|
||||
@Override
|
||||
public TokenStream getTokenStream() {
|
||||
return new CannedBinaryTokenStream(tokens);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void setReader(final Reader reader) {
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a graph token stream with 2 side paths at each position.
|
||||
**/
|
||||
private static CannedBinaryTokenStream.BinaryToken[] createGiantGraph(int numPos) {
|
||||
List<CannedBinaryTokenStream.BinaryToken> tokens = new ArrayList<>();
|
||||
BytesRef term1 = new BytesRef("foo");
|
||||
BytesRef term2 = new BytesRef("bar");
|
||||
for (int i = 0; i < numPos;) {
|
||||
if (i % 2 == 0) {
|
||||
tokens.add(new CannedBinaryTokenStream.BinaryToken(term2, 1, 1));
|
||||
tokens.add(new CannedBinaryTokenStream.BinaryToken(term1, 0, 2));
|
||||
i += 2;
|
||||
} else {
|
||||
tokens.add(new CannedBinaryTokenStream.BinaryToken(term2, 1, 1));
|
||||
i++;
|
||||
}
|
||||
}
|
||||
return tokens.toArray(new CannedBinaryTokenStream.BinaryToken[0]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a graph token stream with {@link BooleanQuery#getMaxClauseCount()}
|
||||
* expansions at the last position.
|
||||
**/
|
||||
private static CannedBinaryTokenStream.BinaryToken[] createGiantGraphMultiTerms() {
|
||||
List<CannedBinaryTokenStream.BinaryToken> tokens = new ArrayList<>();
|
||||
BytesRef term1 = new BytesRef("foo");
|
||||
BytesRef term2 = new BytesRef("bar");
|
||||
tokens.add(new CannedBinaryTokenStream.BinaryToken(term2, 1, 1));
|
||||
tokens.add(new CannedBinaryTokenStream.BinaryToken(term1, 0, 2));
|
||||
tokens.add(new CannedBinaryTokenStream.BinaryToken(term2, 1, 1));
|
||||
tokens.add(new CannedBinaryTokenStream.BinaryToken(term2, 1, 1));
|
||||
for (int i = 0; i < BooleanQuery.getMaxClauseCount(); i++) {
|
||||
tokens.add(new CannedBinaryTokenStream.BinaryToken(term1, 0, 1));
|
||||
}
|
||||
return tokens.toArray(new CannedBinaryTokenStream.BinaryToken[0]);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue