LUCENE-8941: Build wildcard matches lazily

This commit is contained in:
Alan Woodward 2019-08-05 09:44:44 +01:00
parent 64884be044
commit fa72da1c71
3 changed files with 148 additions and 5 deletions

View File

@ -91,6 +91,9 @@ Optimizations
* LUCENE-8935: BooleanQuery with no scoring clause can now early terminate the query when * LUCENE-8935: BooleanQuery with no scoring clause can now early terminate the query when
the total hits is not requested. the total hits is not requested.
* LUCENE-8941: Matches on wildcard queries will defer building their full
disjunction until a MatchesIterator is pulled (Alan Woodward)
Other Other
* LUCENE-8778 LUCENE-8911: Define analyzer SPI names as static final fields and document the names in Javadocs. * LUCENE-8778 LUCENE-8911: Define analyzer SPI names as static final fields and document the names in Javadocs.

View File

@ -74,7 +74,6 @@ final class DisjunctionMatchesIterator implements MatchesIterator {
*/ */
static MatchesIterator fromTermsEnum(LeafReaderContext context, int doc, Query query, String field, BytesRefIterator terms) throws IOException { static MatchesIterator fromTermsEnum(LeafReaderContext context, int doc, Query query, String field, BytesRefIterator terms) throws IOException {
Objects.requireNonNull(field); Objects.requireNonNull(field);
List<MatchesIterator> mis = new ArrayList<>();
Terms t = context.reader().terms(field); Terms t = context.reader().terms(field);
if (t == null) if (t == null)
return null; return null;
@ -84,15 +83,92 @@ final class DisjunctionMatchesIterator implements MatchesIterator {
if (te.seekExact(term)) { if (te.seekExact(term)) {
PostingsEnum pe = te.postings(reuse, PostingsEnum.OFFSETS); PostingsEnum pe = te.postings(reuse, PostingsEnum.OFFSETS);
if (pe.advance(doc) == doc) { if (pe.advance(doc) == doc) {
mis.add(new TermMatchesIterator(query, pe)); return new TermsEnumDisjunctionMatchesIterator(new TermMatchesIterator(query, pe), terms, te, doc, query);
reuse = null;
} }
else { else {
reuse = pe; reuse = pe;
} }
} }
} }
return fromSubIterators(mis); return null;
}
// MatchesIterator over a set of terms that only loads the first matching term at construction,
// waiting until the iterator is actually used before it loads all other matching terms.
private static class TermsEnumDisjunctionMatchesIterator implements MatchesIterator {
private final MatchesIterator first;
private final BytesRefIterator terms;
private final TermsEnum te;
private final int doc;
private final Query query;
private MatchesIterator it = null;
TermsEnumDisjunctionMatchesIterator(MatchesIterator first, BytesRefIterator terms, TermsEnum te, int doc, Query query) {
this.first = first;
this.terms = terms;
this.te = te;
this.doc = doc;
this.query = query;
}
private void init() throws IOException {
List<MatchesIterator> mis = new ArrayList<>();
mis.add(first);
PostingsEnum reuse = null;
for (BytesRef term = terms.next(); term != null; term = terms.next()) {
if (te.seekExact(term)) {
PostingsEnum pe = te.postings(reuse, PostingsEnum.OFFSETS);
if (pe.advance(doc) == doc) {
mis.add(new TermMatchesIterator(query, pe));
reuse = null;
} else {
reuse = pe;
}
}
}
it = fromSubIterators(mis);
}
@Override
public boolean next() throws IOException {
if (it == null) {
init();
}
assert it != null;
return it.next();
}
@Override
public int startPosition() {
return it.startPosition();
}
@Override
public int endPosition() {
return it.endPosition();
}
@Override
public int startOffset() throws IOException {
return it.startOffset();
}
@Override
public int endOffset() throws IOException {
return it.endOffset();
}
@Override
public MatchesIterator getSubMatches() throws IOException {
return it.getSubMatches();
}
@Override
public Query getQuery() {
return it.getQuery();
}
} }
static MatchesIterator fromSubIterators(List<MatchesIterator> mis) throws IOException { static MatchesIterator fromSubIterators(List<MatchesIterator> mis) throws IOException {

View File

@ -31,25 +31,30 @@ import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.IntPoint; import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.TextField; import org.apache.lucene.document.TextField;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.ReaderUtil; import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.spans.SpanNearQuery; import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanOrQuery; import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
public class TestMatchesIterator extends LuceneTestCase { public class TestMatchesIterator extends LuceneTestCase {
protected IndexSearcher searcher; protected IndexSearcher searcher;
protected Directory directory; protected Directory directory;
protected IndexReader reader; protected IndexReader reader = null;
private static final String FIELD_WITH_OFFSETS = "field_offsets"; private static final String FIELD_WITH_OFFSETS = "field_offsets";
private static final String FIELD_NO_OFFSETS = "field_no_offsets"; private static final String FIELD_NO_OFFSETS = "field_no_offsets";
@ -701,4 +706,63 @@ public class TestMatchesIterator extends LuceneTestCase {
}); });
} }
public void testMinimalSeekingWithWildcards() throws IOException {
SeekCountingLeafReader reader = new SeekCountingLeafReader(getOnlyLeafReader(this.reader));
this.searcher = new IndexSearcher(reader);
Query query = new PrefixQuery(new Term(FIELD_WITH_OFFSETS, "w"));
Weight w = searcher.createWeight(query.rewrite(reader), ScoreMode.COMPLETE, 1);
// docs 0-3 match several different terms here, but we only seek to the first term and
// then short-cut return; other terms are ignored until we try and iterate over matches
int[] expectedSeeks = new int[]{ 1, 1, 1, 1, 6, 6 };
int i = 0;
for (LeafReaderContext ctx : reader.leaves()) {
for (int doc = 0; doc < ctx.reader().maxDoc(); doc++) {
reader.seeks = 0;
w.matches(ctx, doc);
assertEquals("Unexpected seek count on doc " + doc, expectedSeeks[i], reader.seeks);
i++;
}
}
}
private static class SeekCountingLeafReader extends FilterLeafReader {
int seeks = 0;
public SeekCountingLeafReader(LeafReader in) {
super(in);
}
@Override
public Terms terms(String field) throws IOException {
Terms terms = super.terms(field);
if (terms == null) {
return null;
}
return new FilterTerms(terms) {
@Override
public TermsEnum iterator() throws IOException {
return new FilterTermsEnum(super.iterator()) {
@Override
public boolean seekExact(BytesRef text) throws IOException {
seeks++;
return super.seekExact(text);
}
};
}
};
}
@Override
public CacheHelper getCoreCacheHelper() {
return null;
}
@Override
public CacheHelper getReaderCacheHelper() {
return null;
}
}
} }