LUCENE-6824: TermAutomatonQuery now rewrites to TermQuery, PhraseQuery or MultiPhraseQuery when the word automaton is simple

This commit is contained in:
Mike McCandless 2016-11-07 05:53:26 -05:00
parent 284eb77ece
commit cc99815dcb
4 changed files with 278 additions and 9 deletions

View File

@ -57,6 +57,12 @@ Other
======================= Lucene 6.4.0 ======================= ======================= Lucene 6.4.0 =======================
(No Changes) (No Changes)
Improvements
* LUCENE-6824: TermAutomatonQuery now rewrites to TermQuery,
PhraseQuery or MultiPhraseQuery when the word automaton is simple
(Mike McCandless)
======================= Lucene 6.3.0 ======================= ======================= Lucene 6.3.0 =======================
API Changes API Changes

View File

@ -265,7 +265,7 @@ public class PhraseQuery extends Query {
* Returns the relative positions of terms in this phrase. * Returns the relative positions of terms in this phrase.
*/ */
public int[] getPositions() { public int[] getPositions() {
return positions; return positions;
} }
@Override @Override

View File

@ -23,9 +23,10 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext; import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.ReaderUtil; import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext; import org.apache.lucene.index.TermContext;
@ -34,6 +35,7 @@ import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.spans.SpanNearQuery; import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations; import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.Transition; import org.apache.lucene.util.automaton.Transition;
@ -183,6 +185,10 @@ public class TermAutomatonQuery extends Query {
det = Operations.removeDeadStates(Operations.determinize(automaton, det = Operations.removeDeadStates(Operations.determinize(automaton,
maxDeterminizedStates)); maxDeterminizedStates));
if (det.isAccept(0)) {
throw new IllegalStateException("cannot accept the empty string");
}
} }
@Override @Override
@ -396,4 +402,82 @@ public class TermAutomatonQuery extends Query {
return null; return null;
} }
} }
public Query rewrite(IndexReader reader) throws IOException {
if (Operations.isEmpty(det)) {
return new MatchNoDocsQuery();
}
IntsRef single = Operations.getSingleton(det);
if (single != null && single.length == 1) {
return new TermQuery(new Term(field, idToTerm.get(single.ints[single.offset])));
}
// TODO: can PhraseQuery really handle multiple terms at the same position? If so, why do we even have MultiPhraseQuery?
// Try for either PhraseQuery or MultiPhraseQuery, which only works when the automaton is a sausage:
MultiPhraseQuery.Builder mpq = new MultiPhraseQuery.Builder();
PhraseQuery.Builder pq = new PhraseQuery.Builder();
Transition t = new Transition();
int state = 0;
int pos = 0;
query:
while (true) {
int count = det.initTransition(state, t);
if (count == 0) {
if (det.isAccept(state) == false) {
mpq = null;
pq = null;
}
break;
} else if (det.isAccept(state)) {
mpq = null;
pq = null;
break;
}
int dest = -1;
List<Term> terms = new ArrayList<>();
boolean matchesAny = false;
for(int i=0;i<count;i++) {
det.getNextTransition(t);
if (i == 0) {
dest = t.dest;
} else if (dest != t.dest) {
mpq = null;
pq = null;
break query;
}
matchesAny |= anyTermID >= t.min && anyTermID <= t.max;
if (matchesAny == false) {
for(int termID=t.min;termID<=t.max;termID++) {
terms.add(new Term(field, idToTerm.get(termID)));
}
}
}
if (matchesAny == false) {
mpq.add(terms.toArray(new Term[terms.size()]), pos);
if (pq != null) {
if (terms.size() == 1) {
pq.add(terms.get(0), pos);
} else {
pq = null;
}
}
}
state = dest;
pos++;
}
if (pq != null) {
return pq.build();
} else if (mpq != null) {
return mpq.build();
}
// TODO: we could maybe also rewrite to union of PhraseQuery (pull all finite strings) if it's "worth it"?
return this;
}
} }

View File

@ -296,7 +296,6 @@ public class TestTermAutomatonQuery extends LuceneTestCase {
while (scorer instanceof AssertingScorer) { while (scorer instanceof AssertingScorer) {
scorer = ((AssertingScorer) scorer).getIn(); scorer = ((AssertingScorer) scorer).getIn();
} }
assert scorer instanceof TermAutomatonScorer;
} }
@Override @Override
@ -683,7 +682,7 @@ public class TestTermAutomatonQuery extends LuceneTestCase {
w.addDocument(doc); w.addDocument(doc);
doc = new Document(); doc = new Document();
doc.add(newTextField("field", "comes here", Field.Store.NO)); doc.add(newTextField("field", "comes foo", Field.Store.NO));
w.addDocument(doc); w.addDocument(doc);
IndexReader r = w.getReader(); IndexReader r = w.getReader();
IndexSearcher s = newSearcher(r); IndexSearcher s = newSearcher(r);
@ -691,9 +690,11 @@ public class TestTermAutomatonQuery extends LuceneTestCase {
TermAutomatonQuery q = new TermAutomatonQuery("field"); TermAutomatonQuery q = new TermAutomatonQuery("field");
int init = q.createState(); int init = q.createState();
int s1 = q.createState(); int s1 = q.createState();
int s2 = q.createState();
q.addTransition(init, s1, "here"); q.addTransition(init, s1, "here");
q.addTransition(s1, init, "comes"); q.addTransition(s1, s2, "comes");
q.setAccept(init, true); q.addTransition(s2, s1, "here");
q.setAccept(s1, true);
q.finish(); q.finish();
assertEquals(1, s.search(q, 1).totalHits); assertEquals(1, s.search(q, 1).totalHits);
@ -779,8 +780,186 @@ public class TestTermAutomatonQuery extends LuceneTestCase {
// System.out.println("DOT: " + q.toDot()); // System.out.println("DOT: " + q.toDot());
assertEquals(0, s.search(q, 1).totalHits); assertEquals(0, s.search(q, 1).totalHits);
w.close(); IOUtils.close(w, r, dir);
r.close(); }
dir.close();
public void testEmptyString() throws Exception {
TermAutomatonQuery q = new TermAutomatonQuery("field");
int initState = q.createState();
q.setAccept(initState, true);
try {
q.finish();
fail("did not hit exc");
} catch (IllegalStateException ise) {
// expected
}
}
public void testRewriteNoMatch() throws Exception {
TermAutomatonQuery q = new TermAutomatonQuery("field");
int initState = q.createState();
q.finish();
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
Document doc = new Document();
doc.add(newTextField("field", "x y z", Field.Store.NO));
w.addDocument(doc);
IndexReader r = w.getReader();
assertTrue(q.rewrite(r) instanceof MatchNoDocsQuery);
IOUtils.close(w, r, dir);
}
public void testRewriteTerm() throws Exception {
TermAutomatonQuery q = new TermAutomatonQuery("field");
int initState = q.createState();
int s1 = q.createState();
q.addTransition(initState, s1, "foo");
q.setAccept(s1, true);
q.finish();
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
Document doc = new Document();
doc.add(newTextField("field", "x y z", Field.Store.NO));
w.addDocument(doc);
IndexReader r = w.getReader();
Query rewrite = q.rewrite(r);
assertTrue(rewrite instanceof TermQuery);
assertEquals(new Term("field", "foo"), ((TermQuery) rewrite).getTerm());
IOUtils.close(w, r, dir);
}
public void testRewriteSimplePhrase() throws Exception {
TermAutomatonQuery q = new TermAutomatonQuery("field");
int initState = q.createState();
int s1 = q.createState();
int s2 = q.createState();
q.addTransition(initState, s1, "foo");
q.addTransition(s1, s2, "bar");
q.setAccept(s2, true);
q.finish();
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
Document doc = new Document();
doc.add(newTextField("field", "x y z", Field.Store.NO));
w.addDocument(doc);
IndexReader r = w.getReader();
Query rewrite = q.rewrite(r);
assertTrue(rewrite instanceof PhraseQuery);
Term[] terms = ((PhraseQuery) rewrite).getTerms();
assertEquals(new Term("field", "foo"), terms[0]);
assertEquals(new Term("field", "bar"), terms[1]);
int[] positions = ((PhraseQuery) rewrite).getPositions();
assertEquals(0, positions[0]);
assertEquals(1, positions[1]);
IOUtils.close(w, r, dir);
}
public void testRewritePhraseWithAny() throws Exception {
TermAutomatonQuery q = new TermAutomatonQuery("field");
int initState = q.createState();
int s1 = q.createState();
int s2 = q.createState();
int s3 = q.createState();
q.addTransition(initState, s1, "foo");
q.addAnyTransition(s1, s2);
q.addTransition(s2, s3, "bar");
q.setAccept(s3, true);
q.finish();
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
Document doc = new Document();
doc.add(newTextField("field", "x y z", Field.Store.NO));
w.addDocument(doc);
IndexReader r = w.getReader();
Query rewrite = q.rewrite(r);
assertTrue(rewrite instanceof PhraseQuery);
Term[] terms = ((PhraseQuery) rewrite).getTerms();
assertEquals(new Term("field", "foo"), terms[0]);
assertEquals(new Term("field", "bar"), terms[1]);
int[] positions = ((PhraseQuery) rewrite).getPositions();
assertEquals(0, positions[0]);
assertEquals(2, positions[1]);
IOUtils.close(w, r, dir);
}
public void testRewriteSimpleMultiPhrase() throws Exception {
TermAutomatonQuery q = new TermAutomatonQuery("field");
int initState = q.createState();
int s1 = q.createState();
q.addTransition(initState, s1, "foo");
q.addTransition(initState, s1, "bar");
q.setAccept(s1, true);
q.finish();
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
Document doc = new Document();
doc.add(newTextField("field", "x y z", Field.Store.NO));
w.addDocument(doc);
IndexReader r = w.getReader();
Query rewrite = q.rewrite(r);
assertTrue(rewrite instanceof MultiPhraseQuery);
Term[][] terms = ((MultiPhraseQuery) rewrite).getTermArrays();
assertEquals(1, terms.length);
assertEquals(2, terms[0].length);
assertEquals(new Term("field", "foo"), terms[0][0]);
assertEquals(new Term("field", "bar"), terms[0][1]);
int[] positions = ((MultiPhraseQuery) rewrite).getPositions();
assertEquals(1, positions.length);
assertEquals(0, positions[0]);
IOUtils.close(w, r, dir);
}
public void testRewriteMultiPhraseWithAny() throws Exception {
TermAutomatonQuery q = new TermAutomatonQuery("field");
int initState = q.createState();
int s1 = q.createState();
int s2 = q.createState();
int s3 = q.createState();
q.addTransition(initState, s1, "foo");
q.addTransition(initState, s1, "bar");
q.addAnyTransition(s1, s2);
q.addTransition(s2, s3, "baz");
q.setAccept(s3, true);
q.finish();
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
Document doc = new Document();
doc.add(newTextField("field", "x y z", Field.Store.NO));
w.addDocument(doc);
IndexReader r = w.getReader();
Query rewrite = q.rewrite(r);
assertTrue(rewrite instanceof MultiPhraseQuery);
Term[][] terms = ((MultiPhraseQuery) rewrite).getTermArrays();
assertEquals(2, terms.length);
assertEquals(2, terms[0].length);
assertEquals(new Term("field", "foo"), terms[0][0]);
assertEquals(new Term("field", "bar"), terms[0][1]);
assertEquals(1, terms[1].length);
assertEquals(new Term("field", "baz"), terms[1][0]);
int[] positions = ((MultiPhraseQuery) rewrite).getPositions();
assertEquals(2, positions.length);
assertEquals(0, positions[0]);
assertEquals(2, positions[1]);
IOUtils.close(w, r, dir);
} }
} }