Optimize TermInSetQuery for terms that match all docs in a segment (#1062)

This commit is contained in:
Greg Miller 2022-08-23 08:37:44 -07:00 committed by GitHub
parent 8021c2db4e
commit 1529606763
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 76 additions and 0 deletions

View File

@ -110,6 +110,9 @@ Optimizations
* LUCENE-10627: Using ByteBuffersDataInput reduce memory copy on compressing data. (luyuncheng)
* GITHUB#1062: Optimize TermInSetQuery when a term is present that matches all docs in a segment.
(Greg Miller)
Bug Fixes
---------------------
* LUCENE-10663: Fix KnnVectorQuery explain with multiple segments. (Shiming Li)

View File

@ -48,6 +48,26 @@ public abstract class DocIdSet implements Accountable {
}
};
/**
 * Returns a {@code DocIdSet} that contains every doc id in {@code [0, maxDoc)}.
 *
 * @param maxDoc one past the largest doc id in the set (i.e. the set size)
 */
public static DocIdSet all(int maxDoc) {
  return new DocIdSet() {
    @Override
    public Bits bits() throws IOException {
      // Random-access view: every bit in [0, maxDoc) reads as set.
      return new Bits.MatchAllBits(maxDoc);
    }

    @Override
    public DocIdSetIterator iterator() throws IOException {
      // Sequential view: iterates 0, 1, ..., maxDoc - 1.
      return DocIdSetIterator.all(maxDoc);
    }

    @Override
    public long ramBytesUsed() {
      // The only captured state is the int maxDoc.
      return Integer.BYTES;
    }
  };
}
/**
* Provides a {@link DocIdSetIterator} to access the set. This implementation can return <code>
* null</code> if there are no docs that match.

View File

@ -282,6 +282,9 @@ public class TermInSetQuery extends Query implements Accountable {
assert field.equals(iterator.field());
if (termsEnum.seekExact(term)) {
if (matchingTerms == null) {
if (reader.maxDoc() == termsEnum.docFreq()) {
return new WeightOrDocIdSet(DocIdSet.all(reader.maxDoc()));
}
docs = termsEnum.postings(docs, PostingsEnum.NONE);
builder.add(docs);
} else if (matchingTerms.size() < threshold) {
@ -289,10 +292,16 @@ public class TermInSetQuery extends Query implements Accountable {
} else {
assert matchingTerms.size() == threshold;
builder = new DocIdSetBuilder(reader.maxDoc(), terms);
if (reader.maxDoc() == termsEnum.docFreq()) {
return new WeightOrDocIdSet(DocIdSet.all(reader.maxDoc()));
}
docs = termsEnum.postings(docs, PostingsEnum.NONE);
builder.add(docs);
for (TermAndState t : matchingTerms) {
t.termsEnum.seekExact(t.term, t.state);
if (reader.maxDoc() == t.docFreq) {
return new WeightOrDocIdSet(DocIdSet.all(reader.maxDoc()));
}
docs = t.termsEnum.postings(docs, PostingsEnum.NONE);
builder.add(docs);
}

View File

@ -19,12 +19,14 @@ package org.apache.lucene.search;
import com.carrotsearch.randomizedtesting.generators.RandomStrings;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
@ -49,6 +51,48 @@ import org.apache.lucene.util.automaton.ByteRunAutomaton;
public class TestTermInSetQuery extends LuceneTestCase {
/**
 * Verifies the TermInSetQuery "all docs" optimization: when one query term occurs in every
 * document of a segment (docFreq == maxDoc), the query must still match all docs and not drop
 * hits contributed by the remaining sparse terms.
 */
public void testAllDocsTerm() throws IOException {
  Directory dir = newDirectory();
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
  String field = "f";
  // Term indexed into every document, which triggers the match-all shortcut.
  BytesRef denseTerm = new BytesRef(TestUtil.randomAnalysisString(random(), 10, true));
  // Enough distinct sparse terms to push the query past the boolean-rewrite threshold.
  Set<BytesRef> randomTerms = new HashSet<>();
  while (randomTerms.size() < TermInSetQuery.BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD) {
    randomTerms.add(new BytesRef(TestUtil.randomAnalysisString(random(), 10, true)));
  }
  // Real assertion instead of a bare `assert`, which is silently skipped without -ea.
  assertEquals(TermInSetQuery.BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD, randomTerms.size());
  BytesRef[] otherTerms = randomTerms.toArray(new BytesRef[0]);
  int numDocs = 10 * otherTerms.length;
  for (int i = 0; i < numDocs; i++) {
    Document doc = new Document();
    doc.add(new StringField(field, denseTerm, Store.NO));
    // Round-robin so each sparse term lands in numDocs / otherTerms.length docs.
    doc.add(new StringField(field, otherTerms[i % otherTerms.length], Store.NO));
    iw.addDocument(doc);
  }
  IndexReader reader = iw.getReader();
  IndexSearcher searcher = newSearcher(reader);
  iw.close();
  // Query over all sparse terms plus the dense term; the dense term alone covers every doc.
  List<BytesRef> queryTerms = new ArrayList<>(randomTerms);
  queryTerms.add(denseTerm);
  TermInSetQuery query = new TermInSetQuery(field, queryTerms);
  TopDocs topDocs = searcher.search(query, numDocs);
  assertEquals(numDocs, topDocs.totalHits.value);
  reader.close();
  dir.close();
}
public void testDuel() throws IOException {
final int iters = atLeast(2);
final String field = "f";