From 734841d6c0f82aa170f882a7d1cb891e87d145df Mon Sep 17 00:00:00 2001 From: Greg Miller Date: Mon, 26 Sep 2022 10:39:47 -0700 Subject: [PATCH] Optimize MultiTermQueryConstantScoreWrapper for case when a term matches all docs in a segment. (#11738) --- lucene/CHANGES.txt | 5 +++ .../MultiTermQueryConstantScoreWrapper.java | 40 ++++++++++++++----- 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index ea22d302723..da470df6e54 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -115,6 +115,11 @@ Bug Fixes trying to apply a dictionary whose size is greater than the maximum supported window size for LZ4. (Adrien Grand) +Optimizations +--------------------- +* GITHUB#11738: Optimize MultiTermQueryConstantScoreWrapper when a term is present that matches all + docs in a segment. (Greg Miller) + * GITHUB#11735: KeywordRepeatFilter + OpenNLPLemmatizer always drops last token of a stream. (Luke Kot-Zaniewski) diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiTermQueryConstantScoreWrapper.java b/lucene/core/src/java/org/apache/lucene/search/MultiTermQueryConstantScoreWrapper.java index f729ed6421c..1271b3c5571 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MultiTermQueryConstantScoreWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/search/MultiTermQueryConstantScoreWrapper.java @@ -125,11 +125,11 @@ final class MultiTermQueryConstantScoreWrapper extends return new ConstantScoreWeight(this, boost) { /** - * Try to collect terms from the given terms enum and return true iff all terms could be - * collected. If {@code false} is returned, the enum is left positioned on the next term. + * Try to collect terms from the given terms enum and return true if all terms could be + * collected or if one of the iterated terms contains all docs for the field. If {@code false} + * is returned, the enum is left positioned on the next term. */ - private boolean collectTerms( - LeafReaderContext context, TermsEnum termsEnum, List terms) + private boolean collectTerms(int fieldDocCount, TermsEnum termsEnum, List terms) throws IOException { final int threshold = Math.min(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD, IndexSearcher.getMaxClauseCount()); @@ -139,12 +139,18 @@ final class MultiTermQueryConstantScoreWrapper extends return true; } TermState state = termsEnum.termState(); - terms.add( + int docFreq = termsEnum.docFreq(); + TermAndState termAndState = new TermAndState( - BytesRef.deepCopyOf(term), - state, - termsEnum.docFreq(), - termsEnum.totalTermFreq())); + BytesRef.deepCopyOf(term), state, docFreq, termsEnum.totalTermFreq()); + if (fieldDocCount == docFreq) { + // If the term contains every document with a value for the field, we can ignore all + // other terms: + terms.clear(); + terms.add(termAndState); + return true; + } + terms.add(termAndState); } return termsEnum.next() == null; } @@ -160,13 +166,14 @@ final class MultiTermQueryConstantScoreWrapper extends return new WeightOrDocIdSet((DocIdSet) null); } + final int fieldDocCount = terms.getDocCount(); final TermsEnum termsEnum = query.getTermsEnum(terms); assert termsEnum != null; PostingsEnum docs = null; final List collectedTerms = new ArrayList<>(); - if (collectTerms(context, termsEnum, collectedTerms)) { + if (collectTerms(fieldDocCount, termsEnum, collectedTerms)) { // build a boolean query BooleanQuery.Builder bq = new BooleanQuery.Builder(); for (TermAndState t : collectedTerms) { @@ -193,6 +200,19 @@ final class MultiTermQueryConstantScoreWrapper extends // Then keep filling the bit set with remaining terms do { docs = termsEnum.postings(docs, PostingsEnum.NONE); + // If a term contains all docs with a value for the specified field, we can discard the + // other terms and just use the dense term's postings: + int docFreq = termsEnum.docFreq(); + if (fieldDocCount == docFreq) { + TermStates termStates = new TermStates(searcher.getTopReaderContext()); + termStates.register( + termsEnum.termState(), context.ord, docFreq, termsEnum.totalTermFreq()); + Query q = + new ConstantScoreQuery( + new TermQuery(new Term(query.field, termsEnum.term()), termStates)); + Weight weight = searcher.rewrite(q).createWeight(searcher, scoreMode, score()); + return new WeightOrDocIdSet(weight); + } builder.add(docs); } while (termsEnum.next() != null);