mirror of https://github.com/apache/lucene.git
LUCENE-6824: TermAutomatonQuery now rewrites to TermQuery, PhraseQuery or MultiPhraseQuery when the word automaton is simple
This commit is contained in:
parent
284eb77ece
commit
cc99815dcb
|
@ -57,6 +57,12 @@ Other
|
|||
======================= Lucene 6.4.0 =======================
|
||||
(No Changes)
|
||||
|
||||
Improvements
|
||||
|
||||
* LUCENE-6824: TermAutomatonQuery now rewrites to TermQuery,
|
||||
PhraseQuery or MultiPhraseQuery when the word automaton is simple
|
||||
(Mike McCandless)
|
||||
|
||||
======================= Lucene 6.3.0 =======================
|
||||
|
||||
API Changes
|
||||
|
|
|
@ -265,7 +265,7 @@ public class PhraseQuery extends Query {
|
|||
* Returns the relative positions of terms in this phrase.
|
||||
*/
|
||||
public int[] getPositions() {
|
||||
return positions;
|
||||
return positions;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -23,9 +23,10 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.index.PostingsEnum;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexReaderContext;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.PostingsEnum;
|
||||
import org.apache.lucene.index.ReaderUtil;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermContext;
|
||||
|
@ -34,6 +35,7 @@ import org.apache.lucene.index.TermsEnum;
|
|||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.spans.SpanNearQuery;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.Operations;
|
||||
import org.apache.lucene.util.automaton.Transition;
|
||||
|
@ -183,6 +185,10 @@ public class TermAutomatonQuery extends Query {
|
|||
|
||||
det = Operations.removeDeadStates(Operations.determinize(automaton,
|
||||
maxDeterminizedStates));
|
||||
|
||||
if (det.isAccept(0)) {
|
||||
throw new IllegalStateException("cannot accept the empty string");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -396,4 +402,82 @@ public class TermAutomatonQuery extends Query {
|
|||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public Query rewrite(IndexReader reader) throws IOException {
|
||||
if (Operations.isEmpty(det)) {
|
||||
return new MatchNoDocsQuery();
|
||||
}
|
||||
|
||||
IntsRef single = Operations.getSingleton(det);
|
||||
if (single != null && single.length == 1) {
|
||||
return new TermQuery(new Term(field, idToTerm.get(single.ints[single.offset])));
|
||||
}
|
||||
|
||||
// TODO: can PhraseQuery really handle multiple terms at the same position? If so, why do we even have MultiPhraseQuery?
|
||||
|
||||
// Try for either PhraseQuery or MultiPhraseQuery, which only works when the automaton is a sausage:
|
||||
MultiPhraseQuery.Builder mpq = new MultiPhraseQuery.Builder();
|
||||
PhraseQuery.Builder pq = new PhraseQuery.Builder();
|
||||
|
||||
Transition t = new Transition();
|
||||
int state = 0;
|
||||
int pos = 0;
|
||||
query:
|
||||
while (true) {
|
||||
int count = det.initTransition(state, t);
|
||||
if (count == 0) {
|
||||
if (det.isAccept(state) == false) {
|
||||
mpq = null;
|
||||
pq = null;
|
||||
}
|
||||
break;
|
||||
} else if (det.isAccept(state)) {
|
||||
mpq = null;
|
||||
pq = null;
|
||||
break;
|
||||
}
|
||||
int dest = -1;
|
||||
List<Term> terms = new ArrayList<>();
|
||||
boolean matchesAny = false;
|
||||
for(int i=0;i<count;i++) {
|
||||
det.getNextTransition(t);
|
||||
if (i == 0) {
|
||||
dest = t.dest;
|
||||
} else if (dest != t.dest) {
|
||||
mpq = null;
|
||||
pq = null;
|
||||
break query;
|
||||
}
|
||||
|
||||
matchesAny |= anyTermID >= t.min && anyTermID <= t.max;
|
||||
|
||||
if (matchesAny == false) {
|
||||
for(int termID=t.min;termID<=t.max;termID++) {
|
||||
terms.add(new Term(field, idToTerm.get(termID)));
|
||||
}
|
||||
}
|
||||
}
|
||||
if (matchesAny == false) {
|
||||
mpq.add(terms.toArray(new Term[terms.size()]), pos);
|
||||
if (pq != null) {
|
||||
if (terms.size() == 1) {
|
||||
pq.add(terms.get(0), pos);
|
||||
} else {
|
||||
pq = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
state = dest;
|
||||
pos++;
|
||||
}
|
||||
|
||||
if (pq != null) {
|
||||
return pq.build();
|
||||
} else if (mpq != null) {
|
||||
return mpq.build();
|
||||
}
|
||||
|
||||
// TODO: we could maybe also rewrite to union of PhraseQuery (pull all finite strings) if it's "worth it"?
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -296,7 +296,6 @@ public class TestTermAutomatonQuery extends LuceneTestCase {
|
|||
while (scorer instanceof AssertingScorer) {
|
||||
scorer = ((AssertingScorer) scorer).getIn();
|
||||
}
|
||||
assert scorer instanceof TermAutomatonScorer;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -683,7 +682,7 @@ public class TestTermAutomatonQuery extends LuceneTestCase {
|
|||
w.addDocument(doc);
|
||||
|
||||
doc = new Document();
|
||||
doc.add(newTextField("field", "comes here", Field.Store.NO));
|
||||
doc.add(newTextField("field", "comes foo", Field.Store.NO));
|
||||
w.addDocument(doc);
|
||||
IndexReader r = w.getReader();
|
||||
IndexSearcher s = newSearcher(r);
|
||||
|
@ -691,9 +690,11 @@ public class TestTermAutomatonQuery extends LuceneTestCase {
|
|||
TermAutomatonQuery q = new TermAutomatonQuery("field");
|
||||
int init = q.createState();
|
||||
int s1 = q.createState();
|
||||
int s2 = q.createState();
|
||||
q.addTransition(init, s1, "here");
|
||||
q.addTransition(s1, init, "comes");
|
||||
q.setAccept(init, true);
|
||||
q.addTransition(s1, s2, "comes");
|
||||
q.addTransition(s2, s1, "here");
|
||||
q.setAccept(s1, true);
|
||||
q.finish();
|
||||
|
||||
assertEquals(1, s.search(q, 1).totalHits);
|
||||
|
@ -779,8 +780,186 @@ public class TestTermAutomatonQuery extends LuceneTestCase {
|
|||
// System.out.println("DOT: " + q.toDot());
|
||||
assertEquals(0, s.search(q, 1).totalHits);
|
||||
|
||||
w.close();
|
||||
r.close();
|
||||
dir.close();
|
||||
IOUtils.close(w, r, dir);
|
||||
}
|
||||
|
||||
public void testEmptyString() throws Exception {
|
||||
TermAutomatonQuery q = new TermAutomatonQuery("field");
|
||||
int initState = q.createState();
|
||||
q.setAccept(initState, true);
|
||||
try {
|
||||
q.finish();
|
||||
fail("did not hit exc");
|
||||
} catch (IllegalStateException ise) {
|
||||
// expected
|
||||
}
|
||||
}
|
||||
|
||||
public void testRewriteNoMatch() throws Exception {
|
||||
TermAutomatonQuery q = new TermAutomatonQuery("field");
|
||||
int initState = q.createState();
|
||||
q.finish();
|
||||
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
|
||||
Document doc = new Document();
|
||||
doc.add(newTextField("field", "x y z", Field.Store.NO));
|
||||
w.addDocument(doc);
|
||||
|
||||
IndexReader r = w.getReader();
|
||||
assertTrue(q.rewrite(r) instanceof MatchNoDocsQuery);
|
||||
IOUtils.close(w, r, dir);
|
||||
}
|
||||
|
||||
public void testRewriteTerm() throws Exception {
|
||||
TermAutomatonQuery q = new TermAutomatonQuery("field");
|
||||
int initState = q.createState();
|
||||
int s1 = q.createState();
|
||||
q.addTransition(initState, s1, "foo");
|
||||
q.setAccept(s1, true);
|
||||
q.finish();
|
||||
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
|
||||
Document doc = new Document();
|
||||
doc.add(newTextField("field", "x y z", Field.Store.NO));
|
||||
w.addDocument(doc);
|
||||
|
||||
IndexReader r = w.getReader();
|
||||
Query rewrite = q.rewrite(r);
|
||||
assertTrue(rewrite instanceof TermQuery);
|
||||
assertEquals(new Term("field", "foo"), ((TermQuery) rewrite).getTerm());
|
||||
IOUtils.close(w, r, dir);
|
||||
}
|
||||
|
||||
public void testRewriteSimplePhrase() throws Exception {
|
||||
TermAutomatonQuery q = new TermAutomatonQuery("field");
|
||||
int initState = q.createState();
|
||||
int s1 = q.createState();
|
||||
int s2 = q.createState();
|
||||
q.addTransition(initState, s1, "foo");
|
||||
q.addTransition(s1, s2, "bar");
|
||||
q.setAccept(s2, true);
|
||||
q.finish();
|
||||
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
|
||||
Document doc = new Document();
|
||||
doc.add(newTextField("field", "x y z", Field.Store.NO));
|
||||
w.addDocument(doc);
|
||||
|
||||
IndexReader r = w.getReader();
|
||||
Query rewrite = q.rewrite(r);
|
||||
assertTrue(rewrite instanceof PhraseQuery);
|
||||
Term[] terms = ((PhraseQuery) rewrite).getTerms();
|
||||
assertEquals(new Term("field", "foo"), terms[0]);
|
||||
assertEquals(new Term("field", "bar"), terms[1]);
|
||||
|
||||
int[] positions = ((PhraseQuery) rewrite).getPositions();
|
||||
assertEquals(0, positions[0]);
|
||||
assertEquals(1, positions[1]);
|
||||
|
||||
IOUtils.close(w, r, dir);
|
||||
}
|
||||
|
||||
public void testRewritePhraseWithAny() throws Exception {
|
||||
TermAutomatonQuery q = new TermAutomatonQuery("field");
|
||||
int initState = q.createState();
|
||||
int s1 = q.createState();
|
||||
int s2 = q.createState();
|
||||
int s3 = q.createState();
|
||||
q.addTransition(initState, s1, "foo");
|
||||
q.addAnyTransition(s1, s2);
|
||||
q.addTransition(s2, s3, "bar");
|
||||
q.setAccept(s3, true);
|
||||
q.finish();
|
||||
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
|
||||
Document doc = new Document();
|
||||
doc.add(newTextField("field", "x y z", Field.Store.NO));
|
||||
w.addDocument(doc);
|
||||
|
||||
IndexReader r = w.getReader();
|
||||
Query rewrite = q.rewrite(r);
|
||||
assertTrue(rewrite instanceof PhraseQuery);
|
||||
Term[] terms = ((PhraseQuery) rewrite).getTerms();
|
||||
assertEquals(new Term("field", "foo"), terms[0]);
|
||||
assertEquals(new Term("field", "bar"), terms[1]);
|
||||
|
||||
int[] positions = ((PhraseQuery) rewrite).getPositions();
|
||||
assertEquals(0, positions[0]);
|
||||
assertEquals(2, positions[1]);
|
||||
|
||||
IOUtils.close(w, r, dir);
|
||||
}
|
||||
|
||||
public void testRewriteSimpleMultiPhrase() throws Exception {
|
||||
TermAutomatonQuery q = new TermAutomatonQuery("field");
|
||||
int initState = q.createState();
|
||||
int s1 = q.createState();
|
||||
q.addTransition(initState, s1, "foo");
|
||||
q.addTransition(initState, s1, "bar");
|
||||
q.setAccept(s1, true);
|
||||
q.finish();
|
||||
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
|
||||
Document doc = new Document();
|
||||
doc.add(newTextField("field", "x y z", Field.Store.NO));
|
||||
w.addDocument(doc);
|
||||
|
||||
IndexReader r = w.getReader();
|
||||
Query rewrite = q.rewrite(r);
|
||||
assertTrue(rewrite instanceof MultiPhraseQuery);
|
||||
Term[][] terms = ((MultiPhraseQuery) rewrite).getTermArrays();
|
||||
assertEquals(1, terms.length);
|
||||
assertEquals(2, terms[0].length);
|
||||
assertEquals(new Term("field", "foo"), terms[0][0]);
|
||||
assertEquals(new Term("field", "bar"), terms[0][1]);
|
||||
|
||||
int[] positions = ((MultiPhraseQuery) rewrite).getPositions();
|
||||
assertEquals(1, positions.length);
|
||||
assertEquals(0, positions[0]);
|
||||
|
||||
IOUtils.close(w, r, dir);
|
||||
}
|
||||
|
||||
public void testRewriteMultiPhraseWithAny() throws Exception {
|
||||
TermAutomatonQuery q = new TermAutomatonQuery("field");
|
||||
int initState = q.createState();
|
||||
int s1 = q.createState();
|
||||
int s2 = q.createState();
|
||||
int s3 = q.createState();
|
||||
q.addTransition(initState, s1, "foo");
|
||||
q.addTransition(initState, s1, "bar");
|
||||
q.addAnyTransition(s1, s2);
|
||||
q.addTransition(s2, s3, "baz");
|
||||
q.setAccept(s3, true);
|
||||
q.finish();
|
||||
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
|
||||
Document doc = new Document();
|
||||
doc.add(newTextField("field", "x y z", Field.Store.NO));
|
||||
w.addDocument(doc);
|
||||
|
||||
IndexReader r = w.getReader();
|
||||
Query rewrite = q.rewrite(r);
|
||||
assertTrue(rewrite instanceof MultiPhraseQuery);
|
||||
Term[][] terms = ((MultiPhraseQuery) rewrite).getTermArrays();
|
||||
assertEquals(2, terms.length);
|
||||
assertEquals(2, terms[0].length);
|
||||
assertEquals(new Term("field", "foo"), terms[0][0]);
|
||||
assertEquals(new Term("field", "bar"), terms[0][1]);
|
||||
assertEquals(1, terms[1].length);
|
||||
assertEquals(new Term("field", "baz"), terms[1][0]);
|
||||
|
||||
int[] positions = ((MultiPhraseQuery) rewrite).getPositions();
|
||||
assertEquals(2, positions.length);
|
||||
assertEquals(0, positions[0]);
|
||||
assertEquals(2, positions[1]);
|
||||
|
||||
IOUtils.close(w, r, dir);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue