mirror of https://github.com/apache/lucene.git
LUCENE-7699: Query parsers now use span queries to produce more efficient phrase queries for multi-token synonyms.
This commit is contained in:
parent
b9c9cddff7
commit
96e8f0a0af
|
@ -196,6 +196,9 @@ Optimizations
|
|||
points (or cut vertices) in order to create more efficient queries for
|
||||
multi-token synonyms. (Jim Ferenczi)
|
||||
|
||||
* LUCENE-7699: Query parsers now use span queries to produce more efficient
|
||||
phrase queries for multi-token synonyms. (Matt Webber via Jim Ferenczi)
|
||||
|
||||
Build
|
||||
|
||||
* LUCENE-7653: Update randomizedtesting to version 2.5.0. (Dawid Weiss)
|
||||
|
|
|
@ -37,6 +37,10 @@ import org.apache.lucene.search.PhraseQuery;
|
|||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.SynonymQuery;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.spans.SpanNearQuery;
|
||||
import org.apache.lucene.search.spans.SpanOrQuery;
|
||||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||
import org.apache.lucene.util.graph.GraphTokenStreamFiniteStrings;
|
||||
|
||||
/**
|
||||
|
@ -313,7 +317,7 @@ public class QueryBuilder {
|
|||
} else if (isGraph) {
|
||||
// graph
|
||||
if (quoted) {
|
||||
return analyzeGraphPhrase(stream, operator, field, phraseSlop);
|
||||
return analyzeGraphPhrase(stream, field, phraseSlop);
|
||||
} else {
|
||||
return analyzeGraphBoolean(field, stream, operator);
|
||||
}
|
||||
|
@ -340,7 +344,31 @@ public class QueryBuilder {
|
|||
throw new RuntimeException("Error analyzing query text", e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Creates a span query from the tokenstream. In the case of a single token, a simple <code>SpanTermQuery</code> is
|
||||
* returned. When there are multiple tokens, an ordered <code>SpanNearQuery</code> with slop of 0 is returned.
* Returns <code>null</code> when the stream exposes no <code>TermToBytesRefAttribute</code> or yields no tokens.
*
* @param in token stream to consume; each token becomes one <code>SpanTermQuery</code>
* @param field name of the field the generated terms belong to
* @return a <code>SpanTermQuery</code>, a <code>SpanNearQuery</code>, or <code>null</code> as described above
* @throws IOException if reading tokens from <code>in</code> fails
|
||||
*/
|
||||
protected final SpanQuery createSpanQuery(TokenStream in, String field) throws IOException {
|
||||
// Without a term attribute there is nothing to read terms from, so no query can be built.
TermToBytesRefAttribute termAtt = in.getAttribute(TermToBytesRefAttribute.class);
|
||||
if (termAtt == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
List<SpanTermQuery> terms = new ArrayList<>();
|
||||
// NOTE(review): the stream is consumed without a reset() call in this method; presumably the
// caller hands over a stream that is ready to be read -- confirm against the call sites.
while (in.incrementToken()) {
|
||||
terms.add(new SpanTermQuery(new Term(field, termAtt.getBytesRef())));
|
||||
}
|
||||
|
||||
// Pick the cheapest query shape for the number of tokens collected.
if (terms.isEmpty()) {
|
||||
return null;
|
||||
} else if (terms.size() == 1) {
|
||||
return terms.get(0);
|
||||
} else {
|
||||
// Slop 0 with the ordered flag set, i.e. the tokens must match as an exact sequence.
return new SpanNearQuery(terms.toArray(new SpanTermQuery[0]), 0, true);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates simple term query from the cached tokenstream contents
|
||||
*/
|
||||
|
@ -520,21 +548,66 @@ public class QueryBuilder {
|
|||
}
|
||||
|
||||
/**
|
||||
* Creates a query from a graph token stream by extracting all the finite strings from the graph and using them to create the query.
|
||||
* Creates a span near (phrase) query from a graph token stream. The articulation points of the graph are visited in
|
||||
* order and the queries created at each point are merged into the returned near query.
|
||||
*/
|
||||
protected Query analyzeGraphPhrase(TokenStream source, BooleanClause.Occur operator, String field, int phraseSlop)
|
||||
protected SpanQuery analyzeGraphPhrase(TokenStream source, String field, int phraseSlop)
|
||||
throws IOException {
|
||||
source.reset();
|
||||
GraphTokenStreamFiniteStrings visitor = new GraphTokenStreamFiniteStrings(source);
|
||||
Iterator<TokenStream> it = visitor.getFiniteStrings();
|
||||
List<Query> queries = new ArrayList<>();
|
||||
while (it.hasNext()) {
|
||||
Query query = createFieldQuery(it.next(), operator, field, true, phraseSlop);
|
||||
if (query != null) {
|
||||
queries.add(query);
|
||||
GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(source);
|
||||
List<SpanQuery> clauses = new ArrayList<>();
|
||||
int[] articulationPoints = graph.articulationPoints();
|
||||
int lastState = 0;
|
||||
for (int i = 0; i <= articulationPoints.length; i++) {
|
||||
int start = lastState;
|
||||
int end = -1;
|
||||
if (i < articulationPoints.length) {
|
||||
end = articulationPoints[i];
|
||||
}
|
||||
lastState = end;
|
||||
final SpanQuery queryPos;
|
||||
if (graph.hasSidePath(start)) {
|
||||
List<SpanQuery> queries = new ArrayList<>();
|
||||
Iterator<TokenStream> it = graph.getFiniteStrings(start, end);
|
||||
while (it.hasNext()) {
|
||||
TokenStream ts = it.next();
|
||||
SpanQuery q = createSpanQuery(ts, field);
|
||||
if (q != null) {
|
||||
queries.add(q);
|
||||
}
|
||||
}
|
||||
if (queries.size() > 0) {
|
||||
queryPos = new SpanOrQuery(queries.toArray(new SpanQuery[0]));
|
||||
} else {
|
||||
queryPos = null;
|
||||
}
|
||||
} else {
|
||||
Term[] terms = graph.getTerms(field, start);
|
||||
assert terms.length > 0;
|
||||
if (terms.length == 1) {
|
||||
queryPos = new SpanTermQuery(terms[0]);
|
||||
} else {
|
||||
SpanTermQuery[] orClauses = new SpanTermQuery[terms.length];
|
||||
for (int idx = 0; idx < terms.length; idx++) {
|
||||
orClauses[idx] = new SpanTermQuery(terms[idx]);
|
||||
}
|
||||
|
||||
queryPos = new SpanOrQuery(orClauses);
|
||||
}
|
||||
}
|
||||
|
||||
if (queryPos != null) {
|
||||
clauses.add(queryPos);
|
||||
}
|
||||
}
|
||||
return new GraphQuery(queries.toArray(new Query[0]));
|
||||
|
||||
if (clauses.isEmpty()) {
|
||||
return null;
|
||||
} else if (clauses.size() == 1) {
|
||||
return clauses.get(0);
|
||||
} else {
|
||||
return new SpanNearQuery(clauses.toArray(new SpanQuery[0]), phraseSlop, true);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -37,6 +37,10 @@ import org.apache.lucene.search.PhraseQuery;
|
|||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.SynonymQuery;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.spans.SpanNearQuery;
|
||||
import org.apache.lucene.search.spans.SpanOrQuery;
|
||||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
import org.apache.lucene.util.automaton.RegExp;
|
||||
|
||||
|
@ -153,14 +157,16 @@ public class TestQueryBuilder extends LuceneTestCase {
|
|||
|
||||
/** forms a span or query of the phrase and its single-term synonym */
|
||||
public void testMultiWordSynonymsPhrase() throws Exception {
|
||||
PhraseQuery.Builder expectedPhrase = new PhraseQuery.Builder();
|
||||
expectedPhrase.add(new Term("field", "guinea"));
|
||||
expectedPhrase.add(new Term("field", "pig"));
|
||||
SpanNearQuery expectedNear = SpanNearQuery.newOrderedNearQuery("field")
|
||||
.addClause(new SpanTermQuery(new Term("field", "guinea")))
|
||||
.addClause(new SpanTermQuery(new Term("field", "pig")))
|
||||
.setSlop(0)
|
||||
.build();
|
||||
|
||||
TermQuery expectedTerm = new TermQuery(new Term("field", "cavy"));
|
||||
SpanTermQuery expectedTerm = new SpanTermQuery(new Term("field", "cavy"));
|
||||
|
||||
QueryBuilder queryBuilder = new QueryBuilder(new MockSynonymAnalyzer());
|
||||
assertEquals(new GraphQuery(expectedPhrase.build(), expectedTerm),
|
||||
assertEquals(new SpanOrQuery(new SpanQuery[]{expectedNear, expectedTerm}),
|
||||
queryBuilder.createPhraseQuery("field", "guinea pig"));
|
||||
}
|
||||
|
||||
|
|
|
@ -51,6 +51,10 @@ import org.apache.lucene.search.PhraseQuery;
|
|||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.SynonymQuery;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.spans.SpanNearQuery;
|
||||
import org.apache.lucene.search.spans.SpanOrQuery;
|
||||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
|
||||
|
||||
|
@ -513,24 +517,30 @@ public class TestQueryParser extends QueryParserTestBase {
|
|||
synonym.add(pig, BooleanClause.Occur.MUST);
|
||||
BooleanQuery guineaPig = synonym.build();
|
||||
|
||||
PhraseQuery phraseGuineaPig = new PhraseQuery.Builder()
|
||||
.add(new Term("field", "guinea"))
|
||||
.add(new Term("field", "pig"))
|
||||
.build();
|
||||
|
||||
GraphQuery graphQuery = new GraphQuery(guineaPig, cavy);
|
||||
assertEquals(graphQuery, dumb.parse("guinea pig"));
|
||||
|
||||
// With the phrase operator, a multi-word synonym source will form a graph query with inner phrase queries.
|
||||
PhraseQuery.Builder phraseSynonym = new PhraseQuery.Builder();
|
||||
phraseSynonym.add(new Term("field", "guinea"));
|
||||
phraseSynonym.add(new Term("field", "pig"));
|
||||
PhraseQuery guineaPigPhrase = phraseSynonym.build();
|
||||
|
||||
graphQuery = new GraphQuery(guineaPigPhrase, cavy);
|
||||
assertEquals(graphQuery, dumb.parse("\"guinea pig\""));
|
||||
// With the phrase operator, a multi-word synonym source will form span near queries.
|
||||
SpanNearQuery spanGuineaPig = SpanNearQuery.newOrderedNearQuery("field")
|
||||
.addClause(new SpanTermQuery(new Term("field", "guinea")))
|
||||
.addClause(new SpanTermQuery(new Term("field", "pig")))
|
||||
.setSlop(0)
|
||||
.build();
|
||||
SpanTermQuery spanCavy = new SpanTermQuery(new Term("field", "cavy"));
|
||||
SpanOrQuery spanPhrase = new SpanOrQuery(new SpanQuery[]{spanGuineaPig, spanCavy});
|
||||
assertEquals(spanPhrase, dumb.parse("\"guinea pig\""));
|
||||
|
||||
// custom behavior: the synonyms are expanded unless you use the quote operator
|
||||
QueryParser smart = new SmartQueryParser();
|
||||
smart.setSplitOnWhitespace(false);
|
||||
graphQuery = new GraphQuery(guineaPig, cavy);
|
||||
assertEquals(graphQuery, smart.parse("guinea pig"));
|
||||
assertEquals(guineaPigPhrase, smart.parse("\"guinea pig\""));
|
||||
assertEquals(phraseGuineaPig, smart.parse("\"guinea pig\""));
|
||||
}
|
||||
|
||||
public void testEnableGraphQueries() throws Exception {
|
||||
|
@ -626,9 +636,9 @@ public class TestQueryParser extends QueryParserTestBase {
|
|||
assertQueryEquals("guinea pig running?", a, "Graph(+field:guinea +field:pig, field:cavy, hasBoolean=true, hasPhrase=false) running?");
|
||||
assertQueryEquals("guinea pig \"running\"", a, "Graph(+field:guinea +field:pig, field:cavy, hasBoolean=true, hasPhrase=false) running");
|
||||
|
||||
assertQueryEquals("\"guinea pig\"~2", a, "Graph(field:\"guinea pig\"~2, field:cavy, hasBoolean=false, hasPhrase=true)");
|
||||
assertQueryEquals("\"guinea pig\"~2", a, "spanOr([spanNear([guinea, pig], 0, true), cavy])");
|
||||
|
||||
assertQueryEquals("field:\"guinea pig\"", a, "Graph(field:\"guinea pig\", field:cavy, hasBoolean=false, hasPhrase=true)");
|
||||
assertQueryEquals("field:\"guinea pig\"", a, "spanOr([spanNear([guinea, pig], 0, true), cavy])");
|
||||
|
||||
splitOnWhitespace = oldSplitOnWhitespace;
|
||||
}
|
||||
|
@ -705,9 +715,9 @@ public class TestQueryParser extends QueryParserTestBase {
|
|||
assertQueryEquals("guinea pig running?", a, "guinea pig running?");
|
||||
assertQueryEquals("guinea pig \"running\"", a, "guinea pig running");
|
||||
|
||||
assertQueryEquals("\"guinea pig\"~2", a, "Graph(field:\"guinea pig\"~2, field:cavy, hasBoolean=false, hasPhrase=true)");
|
||||
assertQueryEquals("\"guinea pig\"~2", a, "spanOr([spanNear([guinea, pig], 0, true), cavy])");
|
||||
|
||||
assertQueryEquals("field:\"guinea pig\"", a, "Graph(field:\"guinea pig\", field:cavy, hasBoolean=false, hasPhrase=true)");
|
||||
assertQueryEquals("field:\"guinea pig\"", a, "spanOr([spanNear([guinea, pig], 0, true), cavy])");
|
||||
|
||||
splitOnWhitespace = oldSplitOnWhitespace;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue