Optimize MultiTermQueryConstantScoreWrapper for the case when a term matches all docs in a segment. (#11738)

This commit is contained in:
Greg Miller 2022-09-26 10:39:47 -07:00 committed by GitHub
parent ac12cd9f17
commit 734841d6c0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 35 additions and 10 deletions

View File

@ -115,6 +115,11 @@ Bug Fixes
trying to apply a dictionary whose size is greater than the maximum supported trying to apply a dictionary whose size is greater than the maximum supported
window size for LZ4. (Adrien Grand) window size for LZ4. (Adrien Grand)
Optimizations
---------------------
* GITHUB#11738: Optimize MultiTermQueryConstantScoreWrapper when one of the matched terms contains
every doc that has a value for the field in a segment. (Greg Miller)
* GITHUB#11735: KeywordRepeatFilter + OpenNLPLemmatizer always drops last token of a stream. * GITHUB#11735: KeywordRepeatFilter + OpenNLPLemmatizer always drops last token of a stream.
(Luke Kot-Zaniewski) (Luke Kot-Zaniewski)

View File

@ -125,11 +125,11 @@ final class MultiTermQueryConstantScoreWrapper<Q extends MultiTermQuery> extends
return new ConstantScoreWeight(this, boost) { return new ConstantScoreWeight(this, boost) {
/** /**
* Try to collect terms from the given terms enum and return true iff all terms could be * Try to collect terms from the given terms enum and return true if all terms could be
* collected. If {@code false} is returned, the enum is left positioned on the next term. * collected or if one of the iterated terms contains all docs for the field. If {@code false}
* is returned, the enum is left positioned on the next term.
*/ */
private boolean collectTerms( private boolean collectTerms(int fieldDocCount, TermsEnum termsEnum, List<TermAndState> terms)
LeafReaderContext context, TermsEnum termsEnum, List<TermAndState> terms)
throws IOException { throws IOException {
final int threshold = final int threshold =
Math.min(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD, IndexSearcher.getMaxClauseCount()); Math.min(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD, IndexSearcher.getMaxClauseCount());
@ -139,12 +139,18 @@ final class MultiTermQueryConstantScoreWrapper<Q extends MultiTermQuery> extends
return true; return true;
} }
TermState state = termsEnum.termState(); TermState state = termsEnum.termState();
terms.add( int docFreq = termsEnum.docFreq();
TermAndState termAndState =
new TermAndState( new TermAndState(
BytesRef.deepCopyOf(term), BytesRef.deepCopyOf(term), state, docFreq, termsEnum.totalTermFreq());
state, if (fieldDocCount == docFreq) {
termsEnum.docFreq(), // If the term contains every document with a value for the field, we can ignore all
termsEnum.totalTermFreq())); // other terms:
terms.clear();
terms.add(termAndState);
return true;
}
terms.add(termAndState);
} }
return termsEnum.next() == null; return termsEnum.next() == null;
} }
@ -160,13 +166,14 @@ final class MultiTermQueryConstantScoreWrapper<Q extends MultiTermQuery> extends
return new WeightOrDocIdSet((DocIdSet) null); return new WeightOrDocIdSet((DocIdSet) null);
} }
final int fieldDocCount = terms.getDocCount();
final TermsEnum termsEnum = query.getTermsEnum(terms); final TermsEnum termsEnum = query.getTermsEnum(terms);
assert termsEnum != null; assert termsEnum != null;
PostingsEnum docs = null; PostingsEnum docs = null;
final List<TermAndState> collectedTerms = new ArrayList<>(); final List<TermAndState> collectedTerms = new ArrayList<>();
if (collectTerms(context, termsEnum, collectedTerms)) { if (collectTerms(fieldDocCount, termsEnum, collectedTerms)) {
// build a boolean query // build a boolean query
BooleanQuery.Builder bq = new BooleanQuery.Builder(); BooleanQuery.Builder bq = new BooleanQuery.Builder();
for (TermAndState t : collectedTerms) { for (TermAndState t : collectedTerms) {
@ -193,6 +200,19 @@ final class MultiTermQueryConstantScoreWrapper<Q extends MultiTermQuery> extends
// Then keep filling the bit set with remaining terms // Then keep filling the bit set with remaining terms
do { do {
docs = termsEnum.postings(docs, PostingsEnum.NONE); docs = termsEnum.postings(docs, PostingsEnum.NONE);
// If a term contains all docs with a value for the specified field, we can discard the
// other terms and just use the dense term's postings:
int docFreq = termsEnum.docFreq();
if (fieldDocCount == docFreq) {
TermStates termStates = new TermStates(searcher.getTopReaderContext());
termStates.register(
termsEnum.termState(), context.ord, docFreq, termsEnum.totalTermFreq());
Query q =
new ConstantScoreQuery(
new TermQuery(new Term(query.field, termsEnum.term()), termStates));
Weight weight = searcher.rewrite(q).createWeight(searcher, scoreMode, score());
return new WeightOrDocIdSet(weight);
}
builder.add(docs); builder.add(docs);
} while (termsEnum.next() != null); } while (termsEnum.next() != null);