mirror of https://github.com/apache/lucene.git
LUCENE-6824: TermAutomatonQuery now rewrites to TermQuery, PhraseQuery or MultiPhraseQuery when the word automaton is simple
This commit is contained in:
parent
284eb77ece
commit
cc99815dcb
|
@ -57,6 +57,12 @@ Other
|
||||||
======================= Lucene 6.4.0 =======================
|
======================= Lucene 6.4.0 =======================
|
||||||
(No Changes)
|
(No Changes)
|
||||||
|
|
||||||
|
Improvements
|
||||||
|
|
||||||
|
* LUCENE-6824: TermAutomatonQuery now rewrites to TermQuery,
|
||||||
|
PhraseQuery or MultiPhraseQuery when the word automaton is simple
|
||||||
|
(Mike McCandless)
|
||||||
|
|
||||||
======================= Lucene 6.3.0 =======================
|
======================= Lucene 6.3.0 =======================
|
||||||
|
|
||||||
API Changes
|
API Changes
|
||||||
|
|
|
@ -265,7 +265,7 @@ public class PhraseQuery extends Query {
|
||||||
* Returns the relative positions of terms in this phrase.
|
* Returns the relative positions of terms in this phrase.
|
||||||
*/
|
*/
|
||||||
public int[] getPositions() {
|
public int[] getPositions() {
|
||||||
return positions;
|
return positions;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -23,9 +23,10 @@ import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import org.apache.lucene.index.PostingsEnum;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.index.IndexReaderContext;
|
import org.apache.lucene.index.IndexReaderContext;
|
||||||
import org.apache.lucene.index.LeafReaderContext;
|
import org.apache.lucene.index.LeafReaderContext;
|
||||||
|
import org.apache.lucene.index.PostingsEnum;
|
||||||
import org.apache.lucene.index.ReaderUtil;
|
import org.apache.lucene.index.ReaderUtil;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
import org.apache.lucene.index.TermContext;
|
import org.apache.lucene.index.TermContext;
|
||||||
|
@ -34,6 +35,7 @@ import org.apache.lucene.index.TermsEnum;
|
||||||
import org.apache.lucene.search.similarities.Similarity;
|
import org.apache.lucene.search.similarities.Similarity;
|
||||||
import org.apache.lucene.search.spans.SpanNearQuery;
|
import org.apache.lucene.search.spans.SpanNearQuery;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.IntsRef;
|
||||||
import org.apache.lucene.util.automaton.Automaton;
|
import org.apache.lucene.util.automaton.Automaton;
|
||||||
import org.apache.lucene.util.automaton.Operations;
|
import org.apache.lucene.util.automaton.Operations;
|
||||||
import org.apache.lucene.util.automaton.Transition;
|
import org.apache.lucene.util.automaton.Transition;
|
||||||
|
@ -183,6 +185,10 @@ public class TermAutomatonQuery extends Query {
|
||||||
|
|
||||||
det = Operations.removeDeadStates(Operations.determinize(automaton,
|
det = Operations.removeDeadStates(Operations.determinize(automaton,
|
||||||
maxDeterminizedStates));
|
maxDeterminizedStates));
|
||||||
|
|
||||||
|
if (det.isAccept(0)) {
|
||||||
|
throw new IllegalStateException("cannot accept the empty string");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -396,4 +402,82 @@ public class TermAutomatonQuery extends Query {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Query rewrite(IndexReader reader) throws IOException {
|
||||||
|
if (Operations.isEmpty(det)) {
|
||||||
|
return new MatchNoDocsQuery();
|
||||||
|
}
|
||||||
|
|
||||||
|
IntsRef single = Operations.getSingleton(det);
|
||||||
|
if (single != null && single.length == 1) {
|
||||||
|
return new TermQuery(new Term(field, idToTerm.get(single.ints[single.offset])));
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: can PhraseQuery really handle multiple terms at the same position? If so, why do we even have MultiPhraseQuery?
|
||||||
|
|
||||||
|
// Try for either PhraseQuery or MultiPhraseQuery, which only works when the automaton is a sausage:
|
||||||
|
MultiPhraseQuery.Builder mpq = new MultiPhraseQuery.Builder();
|
||||||
|
PhraseQuery.Builder pq = new PhraseQuery.Builder();
|
||||||
|
|
||||||
|
Transition t = new Transition();
|
||||||
|
int state = 0;
|
||||||
|
int pos = 0;
|
||||||
|
query:
|
||||||
|
while (true) {
|
||||||
|
int count = det.initTransition(state, t);
|
||||||
|
if (count == 0) {
|
||||||
|
if (det.isAccept(state) == false) {
|
||||||
|
mpq = null;
|
||||||
|
pq = null;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
} else if (det.isAccept(state)) {
|
||||||
|
mpq = null;
|
||||||
|
pq = null;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
int dest = -1;
|
||||||
|
List<Term> terms = new ArrayList<>();
|
||||||
|
boolean matchesAny = false;
|
||||||
|
for(int i=0;i<count;i++) {
|
||||||
|
det.getNextTransition(t);
|
||||||
|
if (i == 0) {
|
||||||
|
dest = t.dest;
|
||||||
|
} else if (dest != t.dest) {
|
||||||
|
mpq = null;
|
||||||
|
pq = null;
|
||||||
|
break query;
|
||||||
|
}
|
||||||
|
|
||||||
|
matchesAny |= anyTermID >= t.min && anyTermID <= t.max;
|
||||||
|
|
||||||
|
if (matchesAny == false) {
|
||||||
|
for(int termID=t.min;termID<=t.max;termID++) {
|
||||||
|
terms.add(new Term(field, idToTerm.get(termID)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (matchesAny == false) {
|
||||||
|
mpq.add(terms.toArray(new Term[terms.size()]), pos);
|
||||||
|
if (pq != null) {
|
||||||
|
if (terms.size() == 1) {
|
||||||
|
pq.add(terms.get(0), pos);
|
||||||
|
} else {
|
||||||
|
pq = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
state = dest;
|
||||||
|
pos++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (pq != null) {
|
||||||
|
return pq.build();
|
||||||
|
} else if (mpq != null) {
|
||||||
|
return mpq.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: we could maybe also rewrite to union of PhraseQuery (pull all finite strings) if it's "worth it"?
|
||||||
|
return this;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -296,7 +296,6 @@ public class TestTermAutomatonQuery extends LuceneTestCase {
|
||||||
while (scorer instanceof AssertingScorer) {
|
while (scorer instanceof AssertingScorer) {
|
||||||
scorer = ((AssertingScorer) scorer).getIn();
|
scorer = ((AssertingScorer) scorer).getIn();
|
||||||
}
|
}
|
||||||
assert scorer instanceof TermAutomatonScorer;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -683,7 +682,7 @@ public class TestTermAutomatonQuery extends LuceneTestCase {
|
||||||
w.addDocument(doc);
|
w.addDocument(doc);
|
||||||
|
|
||||||
doc = new Document();
|
doc = new Document();
|
||||||
doc.add(newTextField("field", "comes here", Field.Store.NO));
|
doc.add(newTextField("field", "comes foo", Field.Store.NO));
|
||||||
w.addDocument(doc);
|
w.addDocument(doc);
|
||||||
IndexReader r = w.getReader();
|
IndexReader r = w.getReader();
|
||||||
IndexSearcher s = newSearcher(r);
|
IndexSearcher s = newSearcher(r);
|
||||||
|
@ -691,9 +690,11 @@ public class TestTermAutomatonQuery extends LuceneTestCase {
|
||||||
TermAutomatonQuery q = new TermAutomatonQuery("field");
|
TermAutomatonQuery q = new TermAutomatonQuery("field");
|
||||||
int init = q.createState();
|
int init = q.createState();
|
||||||
int s1 = q.createState();
|
int s1 = q.createState();
|
||||||
|
int s2 = q.createState();
|
||||||
q.addTransition(init, s1, "here");
|
q.addTransition(init, s1, "here");
|
||||||
q.addTransition(s1, init, "comes");
|
q.addTransition(s1, s2, "comes");
|
||||||
q.setAccept(init, true);
|
q.addTransition(s2, s1, "here");
|
||||||
|
q.setAccept(s1, true);
|
||||||
q.finish();
|
q.finish();
|
||||||
|
|
||||||
assertEquals(1, s.search(q, 1).totalHits);
|
assertEquals(1, s.search(q, 1).totalHits);
|
||||||
|
@ -779,8 +780,186 @@ public class TestTermAutomatonQuery extends LuceneTestCase {
|
||||||
// System.out.println("DOT: " + q.toDot());
|
// System.out.println("DOT: " + q.toDot());
|
||||||
assertEquals(0, s.search(q, 1).totalHits);
|
assertEquals(0, s.search(q, 1).totalHits);
|
||||||
|
|
||||||
w.close();
|
IOUtils.close(w, r, dir);
|
||||||
r.close();
|
}
|
||||||
dir.close();
|
|
||||||
|
public void testEmptyString() throws Exception {
|
||||||
|
TermAutomatonQuery q = new TermAutomatonQuery("field");
|
||||||
|
int initState = q.createState();
|
||||||
|
q.setAccept(initState, true);
|
||||||
|
try {
|
||||||
|
q.finish();
|
||||||
|
fail("did not hit exc");
|
||||||
|
} catch (IllegalStateException ise) {
|
||||||
|
// expected
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRewriteNoMatch() throws Exception {
|
||||||
|
TermAutomatonQuery q = new TermAutomatonQuery("field");
|
||||||
|
int initState = q.createState();
|
||||||
|
q.finish();
|
||||||
|
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(newTextField("field", "x y z", Field.Store.NO));
|
||||||
|
w.addDocument(doc);
|
||||||
|
|
||||||
|
IndexReader r = w.getReader();
|
||||||
|
assertTrue(q.rewrite(r) instanceof MatchNoDocsQuery);
|
||||||
|
IOUtils.close(w, r, dir);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRewriteTerm() throws Exception {
|
||||||
|
TermAutomatonQuery q = new TermAutomatonQuery("field");
|
||||||
|
int initState = q.createState();
|
||||||
|
int s1 = q.createState();
|
||||||
|
q.addTransition(initState, s1, "foo");
|
||||||
|
q.setAccept(s1, true);
|
||||||
|
q.finish();
|
||||||
|
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(newTextField("field", "x y z", Field.Store.NO));
|
||||||
|
w.addDocument(doc);
|
||||||
|
|
||||||
|
IndexReader r = w.getReader();
|
||||||
|
Query rewrite = q.rewrite(r);
|
||||||
|
assertTrue(rewrite instanceof TermQuery);
|
||||||
|
assertEquals(new Term("field", "foo"), ((TermQuery) rewrite).getTerm());
|
||||||
|
IOUtils.close(w, r, dir);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRewriteSimplePhrase() throws Exception {
|
||||||
|
TermAutomatonQuery q = new TermAutomatonQuery("field");
|
||||||
|
int initState = q.createState();
|
||||||
|
int s1 = q.createState();
|
||||||
|
int s2 = q.createState();
|
||||||
|
q.addTransition(initState, s1, "foo");
|
||||||
|
q.addTransition(s1, s2, "bar");
|
||||||
|
q.setAccept(s2, true);
|
||||||
|
q.finish();
|
||||||
|
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(newTextField("field", "x y z", Field.Store.NO));
|
||||||
|
w.addDocument(doc);
|
||||||
|
|
||||||
|
IndexReader r = w.getReader();
|
||||||
|
Query rewrite = q.rewrite(r);
|
||||||
|
assertTrue(rewrite instanceof PhraseQuery);
|
||||||
|
Term[] terms = ((PhraseQuery) rewrite).getTerms();
|
||||||
|
assertEquals(new Term("field", "foo"), terms[0]);
|
||||||
|
assertEquals(new Term("field", "bar"), terms[1]);
|
||||||
|
|
||||||
|
int[] positions = ((PhraseQuery) rewrite).getPositions();
|
||||||
|
assertEquals(0, positions[0]);
|
||||||
|
assertEquals(1, positions[1]);
|
||||||
|
|
||||||
|
IOUtils.close(w, r, dir);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRewritePhraseWithAny() throws Exception {
|
||||||
|
TermAutomatonQuery q = new TermAutomatonQuery("field");
|
||||||
|
int initState = q.createState();
|
||||||
|
int s1 = q.createState();
|
||||||
|
int s2 = q.createState();
|
||||||
|
int s3 = q.createState();
|
||||||
|
q.addTransition(initState, s1, "foo");
|
||||||
|
q.addAnyTransition(s1, s2);
|
||||||
|
q.addTransition(s2, s3, "bar");
|
||||||
|
q.setAccept(s3, true);
|
||||||
|
q.finish();
|
||||||
|
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(newTextField("field", "x y z", Field.Store.NO));
|
||||||
|
w.addDocument(doc);
|
||||||
|
|
||||||
|
IndexReader r = w.getReader();
|
||||||
|
Query rewrite = q.rewrite(r);
|
||||||
|
assertTrue(rewrite instanceof PhraseQuery);
|
||||||
|
Term[] terms = ((PhraseQuery) rewrite).getTerms();
|
||||||
|
assertEquals(new Term("field", "foo"), terms[0]);
|
||||||
|
assertEquals(new Term("field", "bar"), terms[1]);
|
||||||
|
|
||||||
|
int[] positions = ((PhraseQuery) rewrite).getPositions();
|
||||||
|
assertEquals(0, positions[0]);
|
||||||
|
assertEquals(2, positions[1]);
|
||||||
|
|
||||||
|
IOUtils.close(w, r, dir);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRewriteSimpleMultiPhrase() throws Exception {
|
||||||
|
TermAutomatonQuery q = new TermAutomatonQuery("field");
|
||||||
|
int initState = q.createState();
|
||||||
|
int s1 = q.createState();
|
||||||
|
q.addTransition(initState, s1, "foo");
|
||||||
|
q.addTransition(initState, s1, "bar");
|
||||||
|
q.setAccept(s1, true);
|
||||||
|
q.finish();
|
||||||
|
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(newTextField("field", "x y z", Field.Store.NO));
|
||||||
|
w.addDocument(doc);
|
||||||
|
|
||||||
|
IndexReader r = w.getReader();
|
||||||
|
Query rewrite = q.rewrite(r);
|
||||||
|
assertTrue(rewrite instanceof MultiPhraseQuery);
|
||||||
|
Term[][] terms = ((MultiPhraseQuery) rewrite).getTermArrays();
|
||||||
|
assertEquals(1, terms.length);
|
||||||
|
assertEquals(2, terms[0].length);
|
||||||
|
assertEquals(new Term("field", "foo"), terms[0][0]);
|
||||||
|
assertEquals(new Term("field", "bar"), terms[0][1]);
|
||||||
|
|
||||||
|
int[] positions = ((MultiPhraseQuery) rewrite).getPositions();
|
||||||
|
assertEquals(1, positions.length);
|
||||||
|
assertEquals(0, positions[0]);
|
||||||
|
|
||||||
|
IOUtils.close(w, r, dir);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRewriteMultiPhraseWithAny() throws Exception {
|
||||||
|
TermAutomatonQuery q = new TermAutomatonQuery("field");
|
||||||
|
int initState = q.createState();
|
||||||
|
int s1 = q.createState();
|
||||||
|
int s2 = q.createState();
|
||||||
|
int s3 = q.createState();
|
||||||
|
q.addTransition(initState, s1, "foo");
|
||||||
|
q.addTransition(initState, s1, "bar");
|
||||||
|
q.addAnyTransition(s1, s2);
|
||||||
|
q.addTransition(s2, s3, "baz");
|
||||||
|
q.setAccept(s3, true);
|
||||||
|
q.finish();
|
||||||
|
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(newTextField("field", "x y z", Field.Store.NO));
|
||||||
|
w.addDocument(doc);
|
||||||
|
|
||||||
|
IndexReader r = w.getReader();
|
||||||
|
Query rewrite = q.rewrite(r);
|
||||||
|
assertTrue(rewrite instanceof MultiPhraseQuery);
|
||||||
|
Term[][] terms = ((MultiPhraseQuery) rewrite).getTermArrays();
|
||||||
|
assertEquals(2, terms.length);
|
||||||
|
assertEquals(2, terms[0].length);
|
||||||
|
assertEquals(new Term("field", "foo"), terms[0][0]);
|
||||||
|
assertEquals(new Term("field", "bar"), terms[0][1]);
|
||||||
|
assertEquals(1, terms[1].length);
|
||||||
|
assertEquals(new Term("field", "baz"), terms[1][0]);
|
||||||
|
|
||||||
|
int[] positions = ((MultiPhraseQuery) rewrite).getPositions();
|
||||||
|
assertEquals(2, positions.length);
|
||||||
|
assertEquals(0, positions[0]);
|
||||||
|
assertEquals(2, positions[1]);
|
||||||
|
|
||||||
|
IOUtils.close(w, r, dir);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue