LUCENE-6458: Multi-term queries matching few terms per segment now execute like a disjunction.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1680468 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Adrien Grand 2015-05-20 07:25:09 +00:00
parent bea194df80
commit 012bcc2077
3 changed files with 97 additions and 23 deletions

View File

@ -131,6 +131,9 @@ Optimizations
* LUCENE-6350: TermsQuery is now compressed with PrefixCodedTerms.
(Robert Muir, Mike McCandless, Adrien Grand)
* LUCENE-6458: Multi-term queries matching few terms per segment now execute
like a disjunction. (Adrien Grand)
Bug Fixes
* LUCENE-6378: Fix all RuntimeExceptions to throw the underlying root cause.

View File

@ -18,29 +18,48 @@ package org.apache.lucene.search;
*/
import java.io.IOException;
import java.util.Objects;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.util.BitDocIdSet;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
/**
* A wrapper for {@link MultiTermQuery}, that exposes its
* functionality as a {@link Filter}.
* <P>
* <code>MultiTermQueryWrapperFilter</code> is not designed to
* be used by itself. Normally you subclass it to provide a Filter
* counterpart for a {@link MultiTermQuery} subclass.
* <P>
* This class also provides the functionality behind
* {@link MultiTermQuery#CONSTANT_SCORE_REWRITE};
* this is why it is not abstract.
* {@link MultiTermQuery#CONSTANT_SCORE_REWRITE}.
* It tries to rewrite per-segment as a boolean query
* that returns a constant score and otherwise fills a
* bit set with matches and builds a Scorer on top of
* this bit set.
*/
final class MultiTermQueryConstantScoreWrapper<Q extends MultiTermQuery> extends Query {
// mtq that matches 16 terms or less will be executed as a regular disjunction
private static final int BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD = 16;
private static class TermAndState {
final BytesRef term;
final TermState state;
final int docFreq;
final long totalTermFreq;
TermAndState(BytesRef term, TermState state, int docFreq, long totalTermFreq) {
this.term = term;
this.state = state;
this.docFreq = docFreq;
this.totalTermFreq = totalTermFreq;
}
}
protected final Q query;
/**
@ -57,20 +76,17 @@ final class MultiTermQueryConstantScoreWrapper<Q extends MultiTermQuery> extends
}
@Override
@SuppressWarnings({"rawtypes"})
public final boolean equals(final Object o) {
if (o==this) return true;
if (o==null) return false;
if (this.getClass().equals(o.getClass())) {
final MultiTermQueryConstantScoreWrapper that = (MultiTermQueryConstantScoreWrapper) o;
return this.query.equals(that.query) && this.getBoost() == that.getBoost();
if (super.equals(o) == false) {
return false;
}
return false;
final MultiTermQueryConstantScoreWrapper<?> that = (MultiTermQueryConstantScoreWrapper<?>) o;
return this.query.equals(that.query) && this.getBoost() == that.getBoost();
}
@Override
public final int hashCode() {
return Objects.hash(getClass(), query, getBoost());
return 31 * super.hashCode() + query.hashCode();
}
/** Returns the field name for this query */
@ -79,6 +95,22 @@ final class MultiTermQueryConstantScoreWrapper<Q extends MultiTermQuery> extends
@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores) throws IOException {
return new ConstantScoreWeight(this) {
/** Try to collect terms from the given terms enum and return true iff all
* terms could be collected. If {@code false} is returned, the enum is
* left positioned on the next term. */
private boolean collectTerms(LeafReaderContext context, TermsEnum termsEnum, List<TermAndState> terms) throws IOException {
final int threshold = Math.min(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD, BooleanQuery.getMaxClauseCount());
for (int i = 0; i < threshold; ++i) {
final BytesRef term = termsEnum.next();
if (term == null) {
return true;
}
terms.add(new TermAndState(BytesRef.deepCopyOf(term), termsEnum.termState(), termsEnum.docFreq(), termsEnum.totalTermFreq()));
}
return termsEnum.next() == null;
}
@Override
public Scorer scorer(LeafReaderContext context, Bits acceptDocs) throws IOException {
final Terms terms = context.reader().terms(query.field);
@ -92,10 +124,37 @@ final class MultiTermQueryConstantScoreWrapper<Q extends MultiTermQuery> extends
BitDocIdSet.Builder builder = new BitDocIdSet.Builder(context.reader().maxDoc());
PostingsEnum docs = null;
while (termsEnum.next() != null) {
final List<TermAndState> collectedTerms = new ArrayList<>();
if (collectTerms(context, termsEnum, collectedTerms)) {
// build a boolean query
BooleanQuery bq = new BooleanQuery();
for (TermAndState t : collectedTerms) {
final TermContext termContext = new TermContext(searcher.getTopReaderContext());
termContext.register(t.state, context.ord, t.docFreq, t.totalTermFreq);
bq.add(new TermQuery(new Term(query.field, t.term), termContext), Occur.SHOULD);
}
Query q = new ConstantScoreQuery(bq);
q.setBoost(score());
return searcher.rewrite(q).createWeight(searcher, needsScores).scorer(context, acceptDocs);
}
// Too many terms: go back to the terms we already collected and start building the bit set
if (collectedTerms.isEmpty() == false) {
TermsEnum termsEnum2 = terms.iterator();
for (TermAndState t : collectedTerms) {
termsEnum2.seekExact(t.term, t.state);
docs = termsEnum2.postings(acceptDocs, docs, PostingsEnum.NONE);
builder.or(docs);
}
}
// Then keep filling the bit set with remaining terms
do {
docs = termsEnum.postings(acceptDocs, docs, PostingsEnum.NONE);
builder.or(docs);
}
} while (termsEnum.next() != null);
final BitDocIdSet set = builder.build();
if (set == null) {
return null;

View File

@ -56,9 +56,21 @@ public class TermQuery extends Query {
assert termStates != null : "TermContext must not be null";
this.termStates = termStates;
this.similarity = searcher.getSimilarity();
this.stats = similarity.computeWeight(getBoost(),
searcher.collectionStatistics(term.field()),
searcher.termStatistics(term, termStates));
final CollectionStatistics collectionStats;
final TermStatistics termStats;
if (needsScores) {
collectionStats = searcher.collectionStatistics(term.field());
termStats = searcher.termStatistics(term, termStates);
} else {
// do not bother computing actual stats, scores are not needed
final int maxDoc = searcher.getIndexReader().maxDoc();
final int docFreq = termStates.docFreq();
final long totalTermFreq = termStates.totalTermFreq();
collectionStats = new CollectionStatistics(term.field(), maxDoc, -1, -1, -1);
termStats = new TermStatistics(term.bytes(), docFreq, totalTermFreq);
}
this.stats = similarity.computeWeight(getBoost(), collectionStats, termStats);
}
@Override