mirror of https://github.com/apache/lucene.git
TermInSetQuery optimization when all docs in a field match a term (#11828)
This commit is contained in:
parent
367cd2ea95
commit
44b4602776
|
@ -132,6 +132,10 @@ Optimizations
|
|||
* GITHUB#11803: DrillSidewaysScorer has improved to leverage "advance" instead of "next" where
|
||||
possible, and splits out first and second phase checks to delay match confirmation. (Greg Miller)
|
||||
|
||||
* GITHUB#11828: Tweak TermInSetQuery "dense" optimization to only require all docs with a value in a
|
||||
given field to match a term (rather than all docs in a segment). This is consistent with
|
||||
MultiTermQueryConstantScoreWrapper. (Greg Miller)
|
||||
|
||||
Other
|
||||
---------------------
|
||||
* LUCENE-10423: Remove usages of System.currentTimeMillis() from tests. (Marios Trivyzas)
|
||||
|
|
|
@ -266,6 +266,7 @@ public class TermInSetQuery extends Query implements Accountable {
|
|||
if (terms == null) {
|
||||
return null;
|
||||
}
|
||||
final int fieldDocCount = terms.getDocCount();
|
||||
TermsEnum termsEnum = terms.iterator();
|
||||
PostingsEnum docs = null;
|
||||
TermIterator iterator = termData.iterator();
|
||||
|
@ -281,8 +282,18 @@ public class TermInSetQuery extends Query implements Accountable {
|
|||
for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
|
||||
assert field.equals(iterator.field());
|
||||
if (termsEnum.seekExact(term)) {
|
||||
if (reader.maxDoc() == termsEnum.docFreq()) {
|
||||
return new WeightOrDocIdSet(DocIdSet.all(reader.maxDoc()));
|
||||
// If a term contains all docs with a value for the specified field (likely rare),
|
||||
// we can discard the other terms and just use the dense term's postings:
|
||||
int docFreq = termsEnum.docFreq();
|
||||
if (fieldDocCount == docFreq) {
|
||||
TermStates termStates = new TermStates(searcher.getTopReaderContext());
|
||||
termStates.register(
|
||||
termsEnum.termState(), context.ord, docFreq, termsEnum.totalTermFreq());
|
||||
Query q =
|
||||
new ConstantScoreQuery(
|
||||
new TermQuery(new Term(field, termsEnum.term()), termStates));
|
||||
Weight weight = searcher.rewrite(q).createWeight(searcher, scoreMode, score());
|
||||
return new WeightOrDocIdSet(weight);
|
||||
}
|
||||
|
||||
if (matchingTerms == null) {
|
||||
|
@ -304,6 +315,7 @@ public class TermInSetQuery extends Query implements Accountable {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (matchingTerms != null) {
|
||||
assert builder == null;
|
||||
BooleanQuery.Builder bq = new BooleanQuery.Builder();
|
||||
|
|
|
@ -51,7 +51,7 @@ import org.apache.lucene.util.automaton.ByteRunAutomaton;
|
|||
|
||||
public class TestTermInSetQuery extends LuceneTestCase {
|
||||
|
||||
public void testAllDocsTerm() throws IOException {
|
||||
public void testAllDocsInFieldTerm() throws IOException {
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
|
||||
String field = "f";
|
||||
|
@ -69,6 +69,7 @@ public class TestTermInSetQuery extends LuceneTestCase {
|
|||
otherTerms[idx++] = term;
|
||||
}
|
||||
|
||||
// Every doc with a value for `field` will contain `denseTerm`:
|
||||
int numDocs = 10 * otherTerms.length;
|
||||
for (int i = 0; i < numDocs; i++) {
|
||||
Document doc = new Document();
|
||||
|
@ -78,6 +79,12 @@ public class TestTermInSetQuery extends LuceneTestCase {
|
|||
iw.addDocument(doc);
|
||||
}
|
||||
|
||||
// Make sure there are some docs in the index that don't contain a value for the field at all:
|
||||
// Index documents that only carry the unrelated "foo" field, so the segment's maxDoc is
// larger than the number of docs that have a value for the field under test.
for (int i = 0; i < 100; i++) {
  Document doc = new Document();
  doc.add(new StringField("foo", "bar", Store.NO));
  // The document must actually be handed to the writer; building it alone leaves the
  // index unchanged and the field-level (rather than segment-level) case unexercised.
  iw.addDocument(doc);
}
|
||||
|
||||
IndexReader reader = iw.getReader();
|
||||
IndexSearcher searcher = newSearcher(reader);
|
||||
iw.close();
|
||||
|
|
Loading…
Reference in New Issue