LUCENE-9207: Don't build span queries in QueryBuilder (#1239)

QueryBuilder currently has special logic for graph phrase queries with no slop,
constructing a span query that attempts to follow all paths using a combination of
OR and NEAR queries. However, this type of query has known bugs (LUCENE-7398).
This commit removes this logic and just builds a disjunction of phrase queries, one
phrase per path.
This commit is contained in:
Alan Woodward 2020-02-26 14:32:34 +00:00 committed by GitHub
parent 852f02b4b7
commit 98dafe2e10
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 57 additions and 182 deletions

View File

@ -32,18 +32,13 @@ import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostAttribute;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SynonymQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.spans.SpanBoostQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.util.graph.GraphTokenStreamFiniteStrings;
import static org.apache.lucene.search.BoostAttribute.DEFAULT_BOOST;
/**
@ -367,40 +362,6 @@ public class QueryBuilder {
}
/**
 * Builds a span query out of the tokens produced by the given stream. A stream that yields exactly one
 * token produces a plain <code>SpanTermQuery</code>; a stream that yields several tokens produces an
 * ordered <code>SpanNearQuery</code> with slop 0. The boosts read for each token are multiplied together
 * and, when the product differs from the default boost, the result is wrapped in a
 * <code>SpanBoostQuery</code>. Returns <code>null</code> when the stream has no term attribute or yields
 * no tokens.
 */
protected SpanQuery createSpanQuery(TokenStream in, String field) throws IOException {
  final TermToBytesRefAttribute termAttribute = in.getAttribute(TermToBytesRefAttribute.class);
  // Kept ahead of the null check so the order of calls made on the stream is unchanged.
  final BoostAttribute boostAttribute = in.addAttribute(BoostAttribute.class);
  if (termAttribute == null) {
    return null;
  }

  float combinedBoost = DEFAULT_BOOST;
  final List<SpanTermQuery> spanTerms = new ArrayList<>();
  while (in.incrementToken()) {
    combinedBoost *= boostAttribute.getBoost();
    spanTerms.add(new SpanTermQuery(new Term(field, termAttribute.getBytesRef())));
  }

  if (spanTerms.isEmpty()) {
    return null;
  }

  // One token: the term query itself; several tokens: an ordered, slop-0 near query over all of them.
  final SpanQuery unboosted = spanTerms.size() == 1
      ? spanTerms.get(0)
      : new SpanNearQuery(spanTerms.toArray(new SpanQuery[0]), 0, true);

  // Only wrap when the accumulated boost is not the default one.
  return combinedBoost == DEFAULT_BOOST ? unboosted : new SpanBoostQuery(unboosted, combinedBoost);
}
/**
* Creates simple term query from the cached tokenstream contents
*/
protected Query analyzeTerm(String field, TokenStream stream) throws IOException {
@ -595,90 +556,20 @@ public class QueryBuilder {
throws IOException {
source.reset();
GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(source);
if (phraseSlop > 0) {
/**
* Creates a boolean query from the graph token stream by extracting all the finite strings from the graph
* and using them to create phrase queries with the appropriate slop.
*/
BooleanQuery.Builder builder = new BooleanQuery.Builder();
Iterator<TokenStream> it = graph.getFiniteStrings();
while (it.hasNext()) {
Query query = createFieldQuery(it.next(), BooleanClause.Occur.MUST, field, true, phraseSlop);
if (query != null) {
builder.add(query, BooleanClause.Occur.SHOULD);
}
}
return builder.build();
}
/**
* Creates a span near (phrase) query from a graph token stream.
* The articulation points of the graph are visited in order and the queries
* created at each point are merged in the returned near query.
*/
List<SpanQuery> clauses = new ArrayList<>();
int[] articulationPoints = graph.articulationPoints();
int lastState = 0;
int maxClauseCount = IndexSearcher.getMaxClauseCount();
for (int i = 0; i <= articulationPoints.length; i++) {
int start = lastState;
int end = -1;
if (i < articulationPoints.length) {
end = articulationPoints[i];
}
lastState = end;
final SpanQuery queryPos;
if (graph.hasSidePath(start)) {
List<SpanQuery> queries = new ArrayList<>();
Iterator<TokenStream> it = graph.getFiniteStrings(start, end);
while (it.hasNext()) {
TokenStream ts = it.next();
SpanQuery q = createSpanQuery(ts, field);
if (q != null) {
if (queries.size() >= maxClauseCount) {
throw new IndexSearcher.TooManyClauses();
}
queries.add(q);
}
}
if (queries.size() > 0) {
queryPos = new SpanOrQuery(queries.toArray(new SpanQuery[0]));
} else {
queryPos = null;
}
} else {
Term[] terms = graph.getTerms(field, start);
assert terms.length > 0;
if (terms.length == 1) {
queryPos = new SpanTermQuery(terms[0]);
} else {
if (terms.length >= maxClauseCount) {
throw new IndexSearcher.TooManyClauses();
}
SpanTermQuery[] orClauses = new SpanTermQuery[terms.length];
for (int idx = 0; idx < terms.length; idx++) {
orClauses[idx] = new SpanTermQuery(terms[idx]);
}
queryPos = new SpanOrQuery(orClauses);
}
}
if (queryPos != null) {
if (clauses.size() >= maxClauseCount) {
throw new IndexSearcher.TooManyClauses();
}
clauses.add(queryPos);
// Creates a boolean query from the graph token stream by extracting all the
// finite strings from the graph and using them to create phrase queries with
// the appropriate slop.
BooleanQuery.Builder builder = new BooleanQuery.Builder();
Iterator<TokenStream> it = graph.getFiniteStrings();
while (it.hasNext()) {
Query query = createFieldQuery(it.next(), BooleanClause.Occur.MUST, field, true, phraseSlop);
if (query != null) {
builder.add(query, BooleanClause.Occur.SHOULD);
}
}
return builder.build();
if (clauses.isEmpty()) {
return null;
} else if (clauses.size() == 1) {
return clauses.get(0);
} else {
return new SpanNearQuery(clauses.toArray(new SpanQuery[0]), 0, true);
}
}
/**

View File

@ -41,10 +41,6 @@ import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SynonymQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;
@ -163,18 +159,14 @@ public class TestQueryBuilder extends LuceneTestCase {
}
/** forms graph query */
public void testMultiWordSynonymsPhrase() throws Exception {
SpanNearQuery expectedNear = SpanNearQuery.newOrderedNearQuery("field")
.addClause(new SpanTermQuery(new Term("field", "guinea")))
.addClause(new SpanTermQuery(new Term("field", "pig")))
.setSlop(0)
public void testMultiWordSynonymsPhrase() {
Query expected = new BooleanQuery.Builder()
.add(new PhraseQuery("field", "guinea", "pig"), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term("field", "cavy")), BooleanClause.Occur.SHOULD)
.build();
SpanTermQuery expectedTerm = new SpanTermQuery(new Term("field", "cavy"));
QueryBuilder queryBuilder = new QueryBuilder(new MockSynonymAnalyzer());
assertEquals(new SpanOrQuery(new SpanQuery[]{expectedNear, expectedTerm}),
queryBuilder.createPhraseQuery("field", "guinea pig"));
assertEquals(expected, queryBuilder.createPhraseQuery("field", "guinea pig"));
}
public void testMultiWordSynonymsPhraseWithSlop() throws Exception {

View File

@ -50,10 +50,6 @@ import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SynonymQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
@ -546,15 +542,11 @@ public class TestQueryParser extends QueryParserTestBase {
.build();
assertEquals(graphQuery, dumb.parse("guinea pig"));
// With the phrase operator, a multi-word synonym source will form a disjunction of phrase queries, one per path.
SpanNearQuery spanGuineaPig = SpanNearQuery.newOrderedNearQuery("field")
.addClause(new SpanTermQuery(new Term("field", "guinea")))
.addClause(new SpanTermQuery(new Term("field", "pig")))
.setSlop(0)
Query synonyms = new BooleanQuery.Builder()
.add(new PhraseQuery("field", "guinea", "pig"), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term("field", "cavy")), BooleanClause.Occur.SHOULD)
.build();
SpanTermQuery spanCavy = new SpanTermQuery(new Term("field", "cavy"));
SpanOrQuery spanPhrase = new SpanOrQuery(new SpanQuery[]{spanGuineaPig, spanCavy});
assertEquals(spanPhrase, dumb.parse("\"guinea pig\""));
assertEquals(synonyms, dumb.parse("\"guinea pig\""));
// custom behavior, the synonyms are expanded, unless you use quote operator
QueryParser smart = new SmartQueryParser();
@ -682,9 +674,9 @@ public class TestQueryParser extends QueryParserTestBase {
assertQueryEquals("guinea pig running?", a, "((+guinea +pig) cavy) running?");
assertQueryEquals("guinea pig \"running\"", a, "((+guinea +pig) cavy) running");
assertQueryEquals("\"guinea pig\"~2", a, "spanOr([spanNear([guinea, pig], 0, true), cavy])");
assertQueryEquals("\"guinea pig\"~2", a, "\"guinea pig\" cavy");
assertQueryEquals("field:\"guinea pig\"", a, "spanOr([spanNear([guinea, pig], 0, true), cavy])");
assertQueryEquals("field:\"guinea pig\"", a, "\"guinea pig\" cavy");
splitOnWhitespace = oldSplitOnWhitespace;
}
@ -761,9 +753,9 @@ public class TestQueryParser extends QueryParserTestBase {
assertQueryEquals("guinea pig running?", a, "guinea pig running?");
assertQueryEquals("guinea pig \"running\"", a, "guinea pig running");
assertQueryEquals("\"guinea pig\"~2", a, "spanOr([spanNear([guinea, pig], 0, true), cavy])");
assertQueryEquals("\"guinea pig\"~2", a, "\"guinea pig\" cavy");
assertQueryEquals("field:\"guinea pig\"", a, "spanOr([spanNear([guinea, pig], 0, true), cavy])");
assertQueryEquals("field:\"guinea pig\"", a, "\"guinea pig\" cavy");
splitOnWhitespace = oldSplitOnWhitespace;
}

View File

@ -1886,7 +1886,7 @@ public class TestExtendedDismaxParser extends SolrTestCaseJ4 {
try (SolrQueryRequest req = req(sowTrueParams)) {
QParser qParser = QParser.getParser("text:grackle", "edismax", req);
Query q = qParser.getQuery();
assertEquals("+spanOr([spanNear([text:crow, text:blackbird], 0, true), text:grackl])", q.toString());
assertEquals("+(text:\"crow blackbird\" text:grackl)", q.toString());
}
for (SolrParams params : Arrays.asList(noSowParams, sowTrueParams, sowFalseParams)) {
try (SolrQueryRequest req = req(params)) {
@ -1917,13 +1917,13 @@ public class TestExtendedDismaxParser extends SolrTestCaseJ4 {
try (SolrQueryRequest req = req(sowTrueParams)) {
QParser qParser = QParser.getParser("grackle", "edismax", req);
Query q = qParser.getQuery();
assertEquals("+(spanOr([spanNear([text:crow, text:blackbird], 0, true), text:grackl])"
assertEquals("+((text:\"crow blackbird\" text:grackl)"
+ " | (((+text_sw:crow +text_sw:blackbird) text_sw:grackl)))",
q.toString());
qParser = QParser.getParser("grackle wi fi", "edismax", req);
q = qParser.getQuery();
assertEquals("+((spanOr([spanNear([text:crow, text:blackbird], 0, true), text:grackl])"
assertEquals("+(((text:\"crow blackbird\" text:grackl)"
+ " | (((+text_sw:crow +text_sw:blackbird) text_sw:grackl))) (text:wi | text_sw:wi) (text:fi | text_sw:fi))",
q.toString());
}

View File

@ -242,22 +242,22 @@ public class TestMultiWordSynonyms extends SolrTestCaseJ4 {
req(params("sow", "false","qf", "text^10","pf", "text^10","pf2", "text^5","pf3", "text^8"))).getQuery();
assertEquals("+(" +
"((text:foo)^10.0) ((text:a)^10.0) ((text:b)^10.0) (((+text:tropical +text:cyclone) text:bar)^10.0)) " +
"((spanNear([text:foo, text:a, text:b, spanOr([spanNear([text:tropical, text:cyclone], 0, true), text:bar])], 0, true))^10.0) " +
"(((text:\"foo a\")^5.0) ((text:\"a b\")^5.0) ((spanNear([text:b, spanOr([spanNear([text:tropical, text:cyclone], 0, true), text:bar])], 0, true))^5.0)) " +
"(((text:\"foo a b\")^8.0) ((spanNear([text:a, text:b, spanOr([spanNear([text:tropical, text:cyclone], 0, true), text:bar])], 0, true))^8.0))", q.toString());
"((text:\"foo a b tropical cyclone\" text:\"foo a b bar\")^10.0) " +
"(((text:\"foo a\")^5.0) ((text:\"a b\")^5.0) ((text:\"b tropical cyclone\" text:\"b bar\")^5.0)) " +
"(((text:\"foo a b\")^8.0) ((text:\"a b tropical cyclone\" text:\"a b bar\")^8.0))", q.toString());
q = QParser.getParser("tropical cyclone foo a b ","edismax",true, req(params("qf", "text^10","pf", "text^10","pf2", "text^5","pf3", "text^8"))).getQuery();
assertEquals("+(" +
"((text:bar (+text:tropical +text:cyclone))^10.0) ((text:foo)^10.0) ((text:a)^10.0) ((text:b)^10.0)) " +
"((spanNear([spanOr([text:bar, spanNear([text:tropical, text:cyclone], 0, true)]), text:foo, text:a, text:b], 0, true))^10.0) " +
"(((spanOr([text:bar, spanNear([text:tropical, text:cyclone], 0, true)]))^5.0) ((text:\"cyclone foo\")^5.0) ((text:\"foo a\")^5.0) ((text:\"a b\")^5.0)) " +
"(((spanNear([spanOr([text:bar, spanNear([text:tropical, text:cyclone], 0, true)]), text:foo], 0, true))^8.0) ((text:\"cyclone foo a\")^8.0) ((text:\"foo a b\")^8.0))", q.toString());
"((text:\"bar foo a b\" text:\"tropical cyclone foo a b\")^10.0) " +
"(((text:bar text:\"tropical cyclone\")^5.0) ((text:\"cyclone foo\")^5.0) ((text:\"foo a\")^5.0) ((text:\"a b\")^5.0)) " +
"(((text:\"bar foo\" text:\"tropical cyclone foo\")^8.0) ((text:\"cyclone foo a\")^8.0) ((text:\"foo a b\")^8.0))", q.toString());
q = QParser.getParser("foo a b tropical cyclone","edismax",true, req(params("qf", "text^10","pf", "text^10","pf2", "text^5","pf3", "text^8"))).getQuery();
assertEquals("+(" +
"((text:foo)^10.0) ((text:a)^10.0) ((text:b)^10.0) ((text:bar (+text:tropical +text:cyclone))^10.0)) " +
"((spanNear([text:foo, text:a, text:b, spanOr([text:bar, spanNear([text:tropical, text:cyclone], 0, true)])], 0, true))^10.0) " +
"(((text:\"foo a\")^5.0) ((text:\"a b\")^5.0) ((text:\"b tropical\")^5.0) ((spanOr([text:bar, spanNear([text:tropical, text:cyclone], 0, true)]))^5.0)) " +
"(((text:\"foo a b\")^8.0) ((text:\"a b tropical\")^8.0) ((spanNear([text:b, spanOr([text:bar, spanNear([text:tropical, text:cyclone], 0, true)])], 0, true))^8.0))", q.toString());
"((text:\"foo a b bar\" text:\"foo a b tropical cyclone\")^10.0) " +
"(((text:\"foo a\")^5.0) ((text:\"a b\")^5.0) ((text:\"b tropical\")^5.0) ((text:bar text:\"tropical cyclone\")^5.0)) " +
"(((text:\"foo a b\")^8.0) ((text:\"a b tropical\")^8.0) ((text:\"b bar\" text:\"b tropical cyclone\")^8.0))", q.toString());
}
}

View File

@ -1180,7 +1180,7 @@ public class TestSolrQueryParser extends SolrTestCaseJ4 {
QParser qParser = QParser.getParser("text:grackle", req);
qParser.setParams(sowTrueParams);
Query q = qParser.getQuery();
assertEquals("spanOr([spanNear([text:crow, text:blackbird], 0, true), text:grackl])", q.toString());
assertEquals("text:\"crow blackbird\" text:grackl", q.toString());
for (SolrParams params : Arrays.asList(noSowParams, sowTrueParams, sowFalseParams)) {
qParser = QParser.getParser("text_sw:grackle", req); // "text_sw" doesn't specify autoGeneratePhraseQueries => default false
@ -1407,30 +1407,30 @@ public class TestSolrQueryParser extends SolrTestCaseJ4 {
}
public void testSynonymsBoost_phraseQueryMultiTermSynonymsBoost_shouldParseBoostedSpanQuery() throws Exception {
public void testSynonymsBoost_phraseQueryMultiTermSynonymsBoost() throws Exception {
Query q = QParser.getParser("\"snow leopard lion\"", req(params("df", "t_pick_best_boosted_foo", "sow", "false"))).getQuery();
assertEquals("spanNear([" +
"spanOr([" +
"(spanNear([t_pick_best_boosted_foo:panthera, t_pick_best_boosted_foo:uncia], 0, true))^0.9," +
" (spanNear([t_pick_best_boosted_foo:big, t_pick_best_boosted_foo:cat], 0, true))^0.8," +
" (t_pick_best_boosted_foo:white_leopard)^0.6," +
" spanNear([t_pick_best_boosted_foo:snow, t_pick_best_boosted_foo:leopard], 0, true)])," +
" spanOr([" +
"(spanNear([t_pick_best_boosted_foo:panthera, t_pick_best_boosted_foo:leo], 0, true))^0.9," +
" (spanNear([t_pick_best_boosted_foo:simba, t_pick_best_boosted_foo:leo], 0, true))^0.8," +
" (t_pick_best_boosted_foo:kimba)^0.75])], 0, true)", q.toString());
assertEquals("(t_pick_best_boosted_foo:\"panthera uncia panthera leo\")^0.80999994 " +
"(t_pick_best_boosted_foo:\"panthera uncia simba leo\")^0.71999997 " +
"(t_pick_best_boosted_foo:\"panthera uncia kimba\")^0.67499995 " +
"(t_pick_best_boosted_foo:\"big cat panthera leo\")^0.71999997 " +
"(t_pick_best_boosted_foo:\"big cat simba leo\")^0.64000005 " +
"(t_pick_best_boosted_foo:\"big cat kimba\")^0.6 " +
"(t_pick_best_boosted_foo:\"white_leopard panthera leo\")^0.54 " +
"(t_pick_best_boosted_foo:\"white_leopard simba leo\")^0.48000002 " +
"(t_pick_best_boosted_foo:\"white_leopard kimba\")^0.45000002 " +
"(t_pick_best_boosted_foo:\"snow leopard panthera leo\")^0.9 " +
"(t_pick_best_boosted_foo:\"snow leopard simba leo\")^0.8 " +
"(t_pick_best_boosted_foo:\"snow leopard kimba\")^0.75", q.toString());
}
public void testSynonymsBoost_phraseQueryMultiTermSynonymsMultipleBoost_shouldParseMultiplicativeBoostedSpanQuery() throws Exception {
public void testSynonymsBoost_phraseQueryMultiTermSynonymsMultipleBoost() throws Exception {
Query q = QParser.getParser("\"panthera blytheae lion\"", req(params("df", "t_pick_best_boosted_foo", "sow", "false"))).getQuery();
assertEquals("spanNear([" +
"spanOr([" +
"(spanNear([t_pick_best_boosted_foo:oldest, t_pick_best_boosted_foo:ancient, t_pick_best_boosted_foo:panthera], 0, true))^0.45," +
" spanNear([t_pick_best_boosted_foo:panthera, t_pick_best_boosted_foo:blytheae], 0, true)])," +
" spanOr([" +
"(spanNear([t_pick_best_boosted_foo:panthera, t_pick_best_boosted_foo:leo], 0, true))^0.9," +
" (spanNear([t_pick_best_boosted_foo:simba, t_pick_best_boosted_foo:leo], 0, true))^0.8," +
" (t_pick_best_boosted_foo:kimba)^0.75])], 0, true)", q.toString());
assertEquals("(t_pick_best_boosted_foo:\"oldest ancient panthera panthera leo\")^0.40499997 " +
"(t_pick_best_boosted_foo:\"oldest ancient panthera simba leo\")^0.35999998 " +
"(t_pick_best_boosted_foo:\"oldest ancient panthera kimba\")^0.33749998 " +
"(t_pick_best_boosted_foo:\"panthera blytheae panthera leo\")^0.9 " +
"(t_pick_best_boosted_foo:\"panthera blytheae simba leo\")^0.8 " +
"(t_pick_best_boosted_foo:\"panthera blytheae kimba\")^0.75", q.toString());
}
public void testSynonymsBoost_BoostMissing_shouldAssignDefaultBoost() throws Exception {