LUCENE-7699: Query parsers now use span queries to produce more efficient phrase queries for multi-token synonyms.

This commit is contained in:
Jim Ferenczi 2017-02-22 14:05:21 +01:00
parent b9c9cddff7
commit 96e8f0a0af
4 changed files with 122 additions and 30 deletions

View File

@ -196,6 +196,9 @@ Optimizations
points (or cut vertices) in order to create more efficient queries for
multi-token synonyms. (Jim Ferenczi)
* LUCENE-7699: Query parsers now use span queries to produce more efficient
phrase queries for multi-token synonyms. (Matt Webber via Jim Ferenczi)
Build
* LUCENE-7653: Update randomizedtesting to version 2.5.0. (Dawid Weiss)

View File

@ -37,6 +37,10 @@ import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SynonymQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.util.graph.GraphTokenStreamFiniteStrings;
/**
@ -313,7 +317,7 @@ public class QueryBuilder {
} else if (isGraph) {
// graph
if (quoted) {
return analyzeGraphPhrase(stream, operator, field, phraseSlop);
return analyzeGraphPhrase(stream, field, phraseSlop);
} else {
return analyzeGraphBoolean(field, stream, operator);
}
@ -340,7 +344,31 @@ public class QueryBuilder {
throw new RuntimeException("Error analyzing query text", e);
}
}
/**
 * Builds a span query from the tokens of the given stream.
 *
 * <p>Every token read from {@code in} becomes a {@code SpanTermQuery} on {@code field}. One token
 * yields that {@code SpanTermQuery} directly; several tokens are wrapped, in stream order, in an
 * ordered {@code SpanNearQuery} with a slop of 0.
 *
 * @param in the token stream to consume
 * @param field the field the span terms are created for
 * @return the span query, or {@code null} when the stream exposes no
 *         {@code TermToBytesRefAttribute} or produces no tokens
 * @throws IOException if reading from the token stream fails
 */
protected final SpanQuery createSpanQuery(TokenStream in, String field) throws IOException {
  final TermToBytesRefAttribute bytesAtt = in.getAttribute(TermToBytesRefAttribute.class);
  if (bytesAtt == null) {
    // Nothing to read terms from.
    return null;
  }

  final List<SpanTermQuery> spanTerms = new ArrayList<>();
  while (in.incrementToken()) {
    final Term term = new Term(field, bytesAtt.getBytesRef());
    spanTerms.add(new SpanTermQuery(term));
  }

  switch (spanTerms.size()) {
    case 0:
      return null;
    case 1:
      // A lone token does not need a near-query wrapper.
      return spanTerms.get(0);
    default:
      // In-order match with no gaps allowed between the terms.
      return new SpanNearQuery(spanTerms.toArray(new SpanTermQuery[0]), 0, true);
  }
}
/**
* Creates simple term query from the cached tokenstream contents
*/
@ -520,21 +548,66 @@ public class QueryBuilder {
}
// NOTE(review): this span is a rendered diff without +/- markers, so lines of the previous
// GraphQuery-based implementation and of the new span-based implementation are interleaved
// (see the two signatures below). The comments added here describe the span-based variant.
/**
* Creates a query from a graph token stream by extracting all the finite strings from the graph and using them to create the query.
* Creates a span near (phrase) query from a graph token stream. The articulation points of the graph are visited in
* order and the queries created at each point are merged in the returned near query.
*
* <p>Span-based variant: returns {@code null} when no clause could be built, the single clause itself when
* exactly one was built, and otherwise an ordered {@code SpanNearQuery} over all clauses using {@code phraseSlop}.
*
* <p>NOTE(review): {@code phraseSlop} is only passed to the outer {@code SpanNearQuery}. It is not forwarded to
* {@code createSpanQuery}, and it is unused when only one clause results.
*
* @param source the graph token stream; it is reset before being read
* @param field the field the span queries are created for
* @param phraseSlop the slop given to the outer near query
* @throws IOException declared for failures while reading the token stream
*/
protected Query analyzeGraphPhrase(TokenStream source, BooleanClause.Occur operator, String field, int phraseSlop)
protected SpanQuery analyzeGraphPhrase(TokenStream source, String field, int phraseSlop)
throws IOException {
source.reset();
GraphTokenStreamFiniteStrings visitor = new GraphTokenStreamFiniteStrings(source);
Iterator<TokenStream> it = visitor.getFiniteStrings();
List<Query> queries = new ArrayList<>();
while (it.hasNext()) {
Query query = createFieldQuery(it.next(), operator, field, true, phraseSlop);
if (query != null) {
queries.add(query);
GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(source);
// One span clause is collected per visited segment of the graph.
List<SpanQuery> clauses = new ArrayList<>();
int[] articulationPoints = graph.articulationPoints();
int lastState = 0;
// Runs once per articulation point plus one final pass (i == articulationPoints.length).
// Each pass covers the segment [start, end), where start is the previous pass's end.
for (int i = 0; i <= articulationPoints.length; i++) {
int start = lastState;
// end stays -1 on the final pass; presumably this means "up to the end of the graph" —
// TODO confirm against GraphTokenStreamFiniteStrings.getFiniteStrings(int, int).
int end = -1;
if (i < articulationPoints.length) {
end = articulationPoints[i];
}
lastState = end;
final SpanQuery queryPos;
// presumably hasSidePath(start) is true when alternative multi-token paths leave this state — confirm.
if (graph.hasSidePath(start)) {
// Turn every finite string between start and end into a span query and OR them together.
List<SpanQuery> queries = new ArrayList<>();
Iterator<TokenStream> it = graph.getFiniteStrings(start, end);
while (it.hasNext()) {
TokenStream ts = it.next();
SpanQuery q = createSpanQuery(ts, field);
if (q != null) {
queries.add(q);
}
}
if (queries.size() > 0) {
queryPos = new SpanOrQuery(queries.toArray(new SpanQuery[0]));
} else {
// None of the paths produced a query; this segment contributes no clause.
queryPos = null;
}
} else {
// No side path: build the clause from the terms returned for this state.
Term[] terms = graph.getTerms(field, start);
assert terms.length > 0;
if (terms.length == 1) {
// A single term needs no SpanOrQuery wrapper.
queryPos = new SpanTermQuery(terms[0]);
} else {
// Several terms for the same state are combined as alternatives.
SpanTermQuery[] orClauses = new SpanTermQuery[terms.length];
for (int idx = 0; idx < terms.length; idx++) {
orClauses[idx] = new SpanTermQuery(terms[idx]);
}
queryPos = new SpanOrQuery(orClauses);
}
}
// queryPos is null only when the side-path branch above yielded no span queries.
if (queryPos != null) {
clauses.add(queryPos);
}
}
return new GraphQuery(queries.toArray(new Query[0]));
// Collapse the result: nothing -> null, one clause -> that clause, several -> ordered near query.
if (clauses.isEmpty()) {
return null;
} else if (clauses.size() == 1) {
return clauses.get(0);
} else {
return new SpanNearQuery(clauses.toArray(new SpanQuery[0]), phraseSlop, true);
}
}
/**

View File

@ -37,6 +37,10 @@ import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SynonymQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;
@ -153,14 +157,16 @@ public class TestQueryBuilder extends LuceneTestCase {
/**
 * Phrase query over a multi-word synonym ("guinea pig" / "cavy").
 * The earlier expectation was a GraphQuery of a PhraseQuery and a TermQuery; the updated expectation is a
 * SpanOrQuery of an ordered, slop-0 SpanNearQuery("guinea", "pig") and a SpanTermQuery("cavy").
 */
public void testMultiWordSynonymsPhrase() throws Exception {
// Earlier expectation: plain phrase query for "guinea pig".
PhraseQuery.Builder expectedPhrase = new PhraseQuery.Builder();
expectedPhrase.add(new Term("field", "guinea"));
expectedPhrase.add(new Term("field", "pig"));
// Updated expectation: ordered span-near query with slop 0 over the same two terms.
SpanNearQuery expectedNear = SpanNearQuery.newOrderedNearQuery("field")
.addClause(new SpanTermQuery(new Term("field", "guinea")))
.addClause(new SpanTermQuery(new Term("field", "pig")))
.setSlop(0)
.build();
// The single-token synonym, as a TermQuery (earlier) and as a SpanTermQuery (updated).
TermQuery expectedTerm = new TermQuery(new Term("field", "cavy"));
SpanTermQuery expectedTerm = new SpanTermQuery(new Term("field", "cavy"));
QueryBuilder queryBuilder = new QueryBuilder(new MockSynonymAnalyzer());
// Both alternatives are expected to be combined into one query for the phrase "guinea pig".
assertEquals(new GraphQuery(expectedPhrase.build(), expectedTerm),
assertEquals(new SpanOrQuery(new SpanQuery[]{expectedNear, expectedTerm}),
queryBuilder.createPhraseQuery("field", "guinea pig"));
}

View File

@ -51,6 +51,10 @@ import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SynonymQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
@ -513,24 +517,30 @@ public class TestQueryParser extends QueryParserTestBase {
synonym.add(pig, BooleanClause.Occur.MUST);
BooleanQuery guineaPig = synonym.build();
PhraseQuery phraseGuineaPig = new PhraseQuery.Builder()
.add(new Term("field", "guinea"))
.add(new Term("field", "pig"))
.build();
GraphQuery graphQuery = new GraphQuery(guineaPig, cavy);
assertEquals(graphQuery, dumb.parse("guinea pig"));
// With the phrase operator, a multi-word synonym source will form a graph query with inner phrase queries.
PhraseQuery.Builder phraseSynonym = new PhraseQuery.Builder();
phraseSynonym.add(new Term("field", "guinea"));
phraseSynonym.add(new Term("field", "pig"));
PhraseQuery guineaPigPhrase = phraseSynonym.build();
graphQuery = new GraphQuery(guineaPigPhrase, cavy);
assertEquals(graphQuery, dumb.parse("\"guinea pig\""));
// With the phrase operator, a multi-word synonym source will form span near queries.
SpanNearQuery spanGuineaPig = SpanNearQuery.newOrderedNearQuery("field")
.addClause(new SpanTermQuery(new Term("field", "guinea")))
.addClause(new SpanTermQuery(new Term("field", "pig")))
.setSlop(0)
.build();
SpanTermQuery spanCavy = new SpanTermQuery(new Term("field", "cavy"));
SpanOrQuery spanPhrase = new SpanOrQuery(new SpanQuery[]{spanGuineaPig, spanCavy});
assertEquals(spanPhrase, dumb.parse("\"guinea pig\""));
// custom behavior, the synonyms are expanded, unless you use quote operator
QueryParser smart = new SmartQueryParser();
smart.setSplitOnWhitespace(false);
graphQuery = new GraphQuery(guineaPig, cavy);
assertEquals(graphQuery, smart.parse("guinea pig"));
assertEquals(guineaPigPhrase, smart.parse("\"guinea pig\""));
assertEquals(phraseGuineaPig, smart.parse("\"guinea pig\""));
}
public void testEnableGraphQueries() throws Exception {
@ -626,9 +636,9 @@ public class TestQueryParser extends QueryParserTestBase {
assertQueryEquals("guinea pig running?", a, "Graph(+field:guinea +field:pig, field:cavy, hasBoolean=true, hasPhrase=false) running?");
assertQueryEquals("guinea pig \"running\"", a, "Graph(+field:guinea +field:pig, field:cavy, hasBoolean=true, hasPhrase=false) running");
assertQueryEquals("\"guinea pig\"~2", a, "Graph(field:\"guinea pig\"~2, field:cavy, hasBoolean=false, hasPhrase=true)");
assertQueryEquals("\"guinea pig\"~2", a, "spanOr([spanNear([guinea, pig], 0, true), cavy])");
assertQueryEquals("field:\"guinea pig\"", a, "Graph(field:\"guinea pig\", field:cavy, hasBoolean=false, hasPhrase=true)");
assertQueryEquals("field:\"guinea pig\"", a, "spanOr([spanNear([guinea, pig], 0, true), cavy])");
splitOnWhitespace = oldSplitOnWhitespace;
}
@ -705,9 +715,9 @@ public class TestQueryParser extends QueryParserTestBase {
assertQueryEquals("guinea pig running?", a, "guinea pig running?");
assertQueryEquals("guinea pig \"running\"", a, "guinea pig running");
assertQueryEquals("\"guinea pig\"~2", a, "Graph(field:\"guinea pig\"~2, field:cavy, hasBoolean=false, hasPhrase=true)");
assertQueryEquals("\"guinea pig\"~2", a, "spanOr([spanNear([guinea, pig], 0, true), cavy])");
assertQueryEquals("field:\"guinea pig\"", a, "Graph(field:\"guinea pig\", field:cavy, hasBoolean=false, hasPhrase=true)");
assertQueryEquals("field:\"guinea pig\"", a, "spanOr([spanNear([guinea, pig], 0, true), cavy])");
splitOnWhitespace = oldSplitOnWhitespace;
}