mirror of https://github.com/apache/lucene.git
Optimize TermInSetQuery for terms that match all docs in a segment (#1062)
This commit is contained in:
parent 8021c2db4e
commit 1529606763
@@ -110,6 +110,9 @@ Optimizations

* LUCENE-10627: Use ByteBuffersDataInput to reduce memory copying when compressing data. (luyuncheng)

* GITHUB#1062: Optimize TermInSetQuery when a term is present that matches all docs in a segment.
  (Greg Miller)

Bug Fixes
---------------------

* LUCENE-10663: Fix KnnVectorQuery explain with multiple segments. (Shiming Li)
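The core idea of the change: a term whose document frequency equals the segment's maxDoc occurs in every document of that segment, so the rewrite can hand back a match-all doc ID set without reading any postings. Below is a minimal standalone sketch of that check; the helper name termMatchesAllDocs is illustrative only and not part of the patch.

import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

class AllDocsCheck {
  // Hypothetical helper showing the short-circuit this commit relies on:
  // docFreq counts every document containing the term, so equality with
  // maxDoc means the whole segment matches. Deleted documents are filtered
  // separately via liveDocs, so returning a match-all set stays correct.
  static boolean termMatchesAllDocs(LeafReader reader, String field, BytesRef term)
      throws IOException {
    Terms terms = reader.terms(field);
    if (terms == null) {
      return false; // field not indexed in this segment
    }
    TermsEnum termsEnum = terms.iterator();
    return termsEnum.seekExact(term) && termsEnum.docFreq() == reader.maxDoc();
  }
}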
@@ -48,6 +48,26 @@ public abstract class DocIdSet implements Accountable {
        }
      };

  /** A {@code DocIdSet} that matches all doc ids up to a specified doc (exclusive). */
  public static DocIdSet all(int maxDoc) {
    return new DocIdSet() {
      @Override
      public DocIdSetIterator iterator() throws IOException {
        return DocIdSetIterator.all(maxDoc);
      }

      @Override
      public Bits bits() throws IOException {
        return new Bits.MatchAllBits(maxDoc);
      }

      @Override
      public long ramBytesUsed() {
        return Integer.BYTES;
      }
    };
  }

  /**
   * Provides a {@link DocIdSetIterator} to access the set. This implementation can return <code>
   * null</code> if there are no docs that match.
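For context, the new factory simply delegates to DocIdSetIterator.all(maxDoc), which yields every id from 0 through maxDoc - 1. A small usage sketch, assuming a build that includes this patch:

import java.io.IOException;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;

class DocIdSetAllDemo {
  public static void main(String[] args) throws IOException {
    DocIdSet all = DocIdSet.all(5);
    // Per the javadoc above, a generic DocIdSet may return null from
    // iterator() when nothing matches; the match-all set never does.
    DocIdSetIterator it = all.iterator();
    for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
      System.out.println(doc); // prints 0, 1, 2, 3, 4
    }
  }
}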
@@ -282,6 +282,9 @@ public class TermInSetQuery extends Query implements Accountable {
      assert field.equals(iterator.field());
      if (termsEnum.seekExact(term)) {
        if (matchingTerms == null) {
          if (reader.maxDoc() == termsEnum.docFreq()) {
            return new WeightOrDocIdSet(DocIdSet.all(reader.maxDoc()));
          }
          docs = termsEnum.postings(docs, PostingsEnum.NONE);
          builder.add(docs);
        } else if (matchingTerms.size() < threshold) {
@@ -289,10 +292,16 @@ public class TermInSetQuery extends Query implements Accountable {
        } else {
          assert matchingTerms.size() == threshold;
          builder = new DocIdSetBuilder(reader.maxDoc(), terms);
          if (reader.maxDoc() == termsEnum.docFreq()) {
            return new WeightOrDocIdSet(DocIdSet.all(reader.maxDoc()));
          }
          docs = termsEnum.postings(docs, PostingsEnum.NONE);
          builder.add(docs);
          for (TermAndState t : matchingTerms) {
            t.termsEnum.seekExact(t.term, t.state);
            if (reader.maxDoc() == t.docFreq) {
              return new WeightOrDocIdSet(DocIdSet.all(reader.maxDoc()));
            }
            docs = t.termsEnum.postings(docs, PostingsEnum.NONE);
            builder.add(docs);
          }
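A note on the shape of the change above: the rewrite buffers up to threshold matching terms as TermAndState objects before committing to a DocIdSetBuilder, so the docFreq-equals-maxDoc check has to appear at each point where a term's postings are about to be consumed: on the builder-only path, when the buffered set first spills over the threshold, and again while replaying the buffered terms into the fresh builder. Postings are requested with PostingsEnum.NONE throughout because only doc ids are needed to fill the bit set, not frequencies or positions.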
@@ -19,12 +19,14 @@ package org.apache.lucene.search;
import com.carrotsearch.randomizedtesting.generators.RandomStrings;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
@@ -49,6 +51,48 @@ import org.apache.lucene.util.automaton.ByteRunAutomaton;

public class TestTermInSetQuery extends LuceneTestCase {

  public void testAllDocsTerm() throws IOException {
    Directory dir = newDirectory();
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
    String field = "f";

    BytesRef denseTerm = new BytesRef(TestUtil.randomAnalysisString(random(), 10, true));

    Set<BytesRef> randomTerms = new HashSet<>();
    while (randomTerms.size() < TermInSetQuery.BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD) {
      randomTerms.add(new BytesRef(TestUtil.randomAnalysisString(random(), 10, true)));
    }
    assert randomTerms.size() == TermInSetQuery.BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD;
    BytesRef[] otherTerms = new BytesRef[randomTerms.size()];
    int idx = 0;
    for (BytesRef term : randomTerms) {
      otherTerms[idx++] = term;
    }

    int numDocs = 10 * otherTerms.length;
    for (int i = 0; i < numDocs; i++) {
      Document doc = new Document();
      doc.add(new StringField(field, denseTerm, Store.NO));
      BytesRef sparseTerm = otherTerms[i % otherTerms.length];
      doc.add(new StringField(field, sparseTerm, Store.NO));
      iw.addDocument(doc);
    }

    IndexReader reader = iw.getReader();
    IndexSearcher searcher = newSearcher(reader);
    iw.close();

    List<BytesRef> queryTerms = Arrays.stream(otherTerms).collect(Collectors.toList());
    queryTerms.add(denseTerm);

    TermInSetQuery query = new TermInSetQuery(field, queryTerms);
    TopDocs topDocs = searcher.search(query, numDocs);
    assertEquals(numDocs, topDocs.totalHits.value);

    reader.close();
    dir.close();
  }

  public void testDuel() throws IOException {
    final int iters = atLeast(2);
    final String field = "f";
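The new test targets the short-circuit directly: denseTerm appears in every document, so within any segment its docFreq equals maxDoc, while the BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD sparse terms push the query size past the point where it would otherwise rewrite to a BooleanQuery. Asserting that all numDocs documents are hits checks that the match-all path drops nothing.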