LUCENE-3328: specialize ConjunctionScorer if all required clauses are TermQueries

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1149547 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Simon Willnauer 2011-07-22 10:26:58 +00:00
parent cf0e50f71d
commit 3100c0517a
4 changed files with 177 additions and 9 deletions

View File

@ -494,6 +494,10 @@ Optimizations
instance for merging and NRT readers, which enables directory impls
to separately tune IO flags for each. (Varun Thacker, Simon
Willnauer, Mike McCandless)
* LUCENE-3328: BooleanQuery now uses a specialized ConjunctionScorer if all
boolean clauses are required and instances of TermQuery.
(Simon Willnauer, Robert Muir)
Bug fixes

View File

@ -18,10 +18,14 @@ package org.apache.lucene.search;
*/
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.ToStringUtils;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.ConjunctionTermScorer.DocsAndFreqs;
import org.apache.lucene.search.Similarity.ExactDocScorer;
import org.apache.lucene.search.TermQuery.TermWeight;
import java.io.IOException;
import java.util.*;
@ -166,17 +170,24 @@ public class BooleanQuery extends Query implements Iterable<BooleanClause> {
protected ArrayList<Weight> weights;
protected int maxCoord; // num optional + num required
private final boolean disableCoord;
private final boolean termConjunction;
public BooleanWeight(IndexSearcher searcher, boolean disableCoord)
throws IOException {
this.similarityProvider = searcher.getSimilarityProvider();
this.disableCoord = disableCoord;
weights = new ArrayList<Weight>(clauses.size());
boolean termConjunction = clauses.isEmpty() || minNrShouldMatch != 0 ? false : true;
for (int i = 0 ; i < clauses.size(); i++) {
BooleanClause c = clauses.get(i);
weights.add(c.getQuery().createWeight(searcher));
Weight w = c.getQuery().createWeight(searcher);
if (!(c.isRequired() && (w instanceof TermWeight))) {
termConjunction = false;
}
weights.add(w);
if (!c.isProhibited()) maxCoord++;
}
this.termConjunction = termConjunction;
}
@Override
@ -290,6 +301,10 @@ public class BooleanQuery extends Query implements Iterable<BooleanClause> {
@Override
public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext)
throws IOException {
if (termConjunction) {
// specialized scorer for term conjunctions
return createConjunctionTermScorer(context);
}
List<Scorer> required = new ArrayList<Scorer>();
List<Scorer> prohibited = new ArrayList<Scorer>();
List<Scorer> optional = new ArrayList<Scorer>();
@ -328,6 +343,23 @@ public class BooleanQuery extends Query implements Iterable<BooleanClause> {
// Return a BooleanScorer2
return new BooleanScorer2(this, disableCoord, minNrShouldMatch, required, prohibited, optional, maxCoord);
}
/**
 * Builds a {@link ConjunctionTermScorer} over all weights of this boolean
 * weight, each of which is cast to {@link TermWeight}.
 *
 * @param context the reader context to create the scorer for
 * @return the conjunction scorer, or <code>null</code> as soon as one of the
 *         weights yields no {@link TermsEnum} for the given context
 * @throws IOException if one of the underlying calls throws it
 */
private Scorer createConjunctionTermScorer(AtomicReaderContext context)
    throws IOException {
  final int numTerms = weights.size();
  final DocsAndFreqs[] docsAndFreqs = new DocsAndFreqs[numTerms];
  for (int slot = 0; slot < numTerms; slot++) {
    final TermWeight termWeight = (TermWeight) weights.get(slot);
    final TermsEnum termsEnum = termWeight.getTermsEnum(context);
    if (termsEnum == null) {
      // no enum for this weight in this context - give up on the whole scorer
      return null;
    }
    final ExactDocScorer docScorer = termWeight.createDocScorer(context);
    // the docs enum and the doc frequency are both pulled from the enum right
    // here, in this order, before the next weight is looked at
    docsAndFreqs[slot] = new DocsAndFreqs(
        termsEnum.docs(context.reader.getLiveDocs(), null),
        termsEnum.docFreq(),
        docScorer);
  }

  // every clause matches on a hit, so overlap and maxOverlap are both numTerms
  final float coordFactor;
  if (disableCoord) {
    coordFactor = 1.0f;
  } else {
    coordFactor = coord(numTerms, numTerms);
  }
  return new ConjunctionTermScorer(this, coordFactor, docsAndFreqs);
}
@Override
public boolean scoresDocsOutOfOrder() {

View File

@ -0,0 +1,110 @@
package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.search.Similarity.ExactDocScorer;
import org.apache.lucene.util.ArrayUtil;
import java.io.IOException;
import java.util.Comparator;
/**
 * Scorer for conjunctions of terms: a document is only returned if every one
 * of the supplied {@link DocsEnum}s is positioned on it.
 */
final class ConjunctionTermScorer extends Scorer {
  /** Factor the summed score is multiplied with in {@link #score()}. */
  private final float coord;
  /** Last document returned by {@link #nextDoc()} or {@link #advance(int)}; -1 initially. */
  private int lastDoc = -1;
  /** All enums taking part in the conjunction, sorted by ascending freq. */
  private final DocsAndFreqs[] docsAndFreqs;
  /** First entry of the sorted array; it drives the intersection. */
  private final DocsAndFreqs lead;

  /**
   * Creates the scorer. The passed array is sorted in place by ascending
   * {@link DocsAndFreqs#freq} and its first element becomes the lead.
   *
   * @param weight the weight handed to the {@link Scorer} super constructor
   * @param coord factor applied to the summed score
   * @param docsAndFreqs the enums to intersect; element 0 is read, so the
   *        array must not be empty
   */
  ConjunctionTermScorer(Weight weight, float coord,
      DocsAndFreqs[] docsAndFreqs) throws IOException {
    super(weight);
    this.coord = coord;
    this.docsAndFreqs = docsAndFreqs;
    // Order the enums once up front so that the one with the smallest freq
    // ends up in slot 0 and leads the matching.
    final Comparator<DocsAndFreqs> byAscendingFreq = new Comparator<DocsAndFreqs>() {
      public int compare(DocsAndFreqs left, DocsAndFreqs right) {
        return left.freq - right.freq;
      }
    };
    ArrayUtil.mergeSort(docsAndFreqs, byAscendingFreq);
    lead = docsAndFreqs[0];
  }

  /**
   * Starting from <code>doc</code> (the lead's current document), moves the
   * enums forward until all of them sit on the same document.
   *
   * @return the common document, or NO_MORE_DOCS once the lead is exhausted
   */
  private int doNext(int doc) throws IOException {
    while (lead.doc != DocsEnum.NO_MORE_DOCS) {
      boolean allOnDoc = true;
      for (int i = 1; i < docsAndFreqs.length; i++) {
        final DocsAndFreqs other = docsAndFreqs[i];
        if (other.doc < doc) {
          // lagging behind the candidate - catch up
          other.doc = other.docs.advance(doc);
        }
        if (other.doc > doc) {
          // this enum is already beyond the candidate, so the candidate
          // cannot match - stop checking and move the lead
          allOnDoc = false;
          break;
        }
      }
      if (allOnDoc) {
        // success - every enum is positioned on doc
        return doc;
      }
      // step the lead to its next document and try again
      doc = lead.doc = lead.docs.nextDoc();
    }
    return NO_MORE_DOCS;
  }

  @Override
  public int advance(int target) throws IOException {
    final int leadDoc = lead.docs.advance(target);
    lead.doc = leadDoc;
    lastDoc = doNext(leadDoc);
    return lastDoc;
  }

  @Override
  public int docID() {
    return lastDoc;
  }

  @Override
  public int nextDoc() throws IOException {
    final int leadDoc = lead.docs.nextDoc();
    lead.doc = leadDoc;
    lastDoc = doNext(leadDoc);
    return lastDoc;
  }

  /**
   * Sums the per-enum scores for the current document, in array order, and
   * multiplies the sum with the coord factor.
   */
  @Override
  public float score() throws IOException {
    float total = 0.0f;
    for (int i = 0; i < docsAndFreqs.length; i++) {
      final DocsAndFreqs entry = docsAndFreqs[i];
      total += entry.docScorer.score(lastDoc, entry.docs.freq());
    }
    return total * coord;
  }

  /**
   * Bundles one {@link DocsEnum} with the freq value used for ordering, the
   * {@link ExactDocScorer} used to score it and its current document.
   */
  static final class DocsAndFreqs {
    final DocsEnum docs;
    final int freq;
    final ExactDocScorer docScorer;
    /** Document the enum was last moved to by the scorer; -1 before first use. */
    int doc = -1;

    DocsAndFreqs(DocsEnum docs, int freq, ExactDocScorer docScorer) {
      this.docs = docs;
      this.freq = freq;
      this.docScorer = docScorer;
    }
  }
}

View File

@ -24,6 +24,7 @@ import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader.ReaderContext;
import org.apache.lucene.index.Term;
@ -41,7 +42,7 @@ public class TermQuery extends Query {
private int docFreq;
private transient TermContext perReaderTermState;
private class TermWeight extends Weight {
final class TermWeight extends Weight {
private final Similarity similarity;
private final Similarity.Stats stats;
private transient TermContext termStates;
@ -72,17 +73,38 @@ public class TermQuery extends Query {
@Override
public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
final String field = term.field();
final IndexReader reader = context.reader;
assert termStates.topReaderContext == ReaderUtil.getTopLevelContext(context) : "The top-reader used to create Weight (" + termStates.topReaderContext + ") is not the same as the current reader's top-reader (" + ReaderUtil.getTopLevelContext(context);
final TermState state = termStates.get(context.ord);
if (state == null) { // term is not present in that reader
assert termNotInReader(reader, field, term.bytes()) : "no termstate found but term exists in reader";
final TermsEnum termsEnum = getTermsEnum(context);
if (termsEnum == null) {
return null;
}
final DocsEnum docs = reader.termDocsEnum(reader.getLiveDocs(), field, term.bytes(), state);
// TODO should we reuse the DocsEnum here?
final DocsEnum docs = termsEnum.docs(context.reader.getLiveDocs(), null);
assert docs != null;
return new TermScorer(this, docs, similarity.exactDocScorer(stats, field, context));
return new TermScorer(this, docs, createDocScorer(context));
}
/**
 * Creates an {@link ExactDocScorer} for this {@link TermWeight} by asking
 * the weight's similarity for one, using the weight's stats and the field of
 * the query term.
 *
 * @param context the reader context the scorer is created for
 */
ExactDocScorer createDocScorer(AtomicReaderContext context)
    throws IOException {
  final String field = term.field();
  return similarity.exactDocScorer(stats, field, context);
}
/**
 * Returns a {@link TermsEnum} positioned at this weight's term, or
 * <code>null</code> if no term state is recorded for the given context,
 * i.e. the term does not exist in that reader.
 *
 * @param context the reader context to look the term up in
 */
TermsEnum getTermsEnum(AtomicReaderContext context) throws IOException {
  final TermState state = termStates.get(context.ord);
  if (state != null) {
    final Terms terms = context.reader.terms(term.field());
    final TermsEnum termsEnum = terms.getThreadTermsEnum();
    // position the enum on the term using the recorded state
    termsEnum.seekExact(term.bytes(), state);
    return termsEnum;
  }
  // term is not present in that reader
  assert termNotInReader(context.reader, term.field(), term.bytes()) : "no termstate found but term exists in reader";
  return null;
}
private boolean termNotInReader(IndexReader reader, String field, BytesRef bytes) throws IOException {