LUCENE-1603: some improvements to MultiTermQuery/FilteredTermEnum

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@764930 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2009-04-14 19:35:43 +00:00
parent 87e10db563
commit 1699ee9d7f
3 changed files with 78 additions and 23 deletions

View File

@ -278,6 +278,12 @@ New features
turnaround than the normal approach of commiting the changes and
then reopening a reader. (Jason Rutherglen via Mike McCandless)
21. LUCENE-1603: Some improvements to MultiTermQuery: return
DocIdSet.EMPTY_DOCIDSET if there are no terms in the enum; track
the total number of terms it visited during rewrite
(getTotalNumberOfTerms). Also, FilteredTermEnum is now more
friendly to subclasses. (Uwe Schindler via Mike McCandless)
Optimizations
1. LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing

View File

@ -26,8 +26,11 @@ import org.apache.lucene.index.TermEnum;
<p>Term enumerations are always ordered by Term.compareTo(). Each term in
the enumeration is greater than all that precede it. */
public abstract class FilteredTermEnum extends TermEnum {
private Term currentTerm = null;
private TermEnum actualEnum = null;
/** the current term */
protected Term currentTerm = null;
/** the delegate enum - to set this member use {@link #setEnum} */
protected TermEnum actualEnum = null;
public FilteredTermEnum() {}
@ -40,6 +43,10 @@ public abstract class FilteredTermEnum extends TermEnum {
/** Indicates the end of the enumeration has been reached */
protected abstract boolean endEnum();
/**
* use this method to set the actual TermEnum (e.g. in ctor),
* it will be automatically positioned on the first matching term.
*/
protected void setEnum(TermEnum actualEnum) throws IOException {
this.actualEnum = actualEnum;
// Find the first term that matches
@ -54,7 +61,8 @@ public abstract class FilteredTermEnum extends TermEnum {
* Returns -1 if no Term matches or all terms have been enumerated.
*/
public int docFreq() {
if (actualEnum == null) return -1;
if (currentTerm == null) return -1;
assert actualEnum != null;
return actualEnum.docFreq();
}
@ -85,7 +93,7 @@ public abstract class FilteredTermEnum extends TermEnum {
/** Closes the enumeration to further activity, freeing resources. */
public void close() throws IOException {
actualEnum.close();
if (actualEnum != null) actualEnum.close();
currentTerm = null;
actualEnum = null;
}

View File

@ -45,6 +45,7 @@ import org.apache.lucene.util.ToStringUtils;
public abstract class MultiTermQuery extends Query {
protected Term term;
protected boolean constantScoreRewrite = false;
transient int numberOfTerms = 0;
/** Constructs a query for terms matching <code>term</code>. */
public MultiTermQuery(Term term) {
@ -67,6 +68,34 @@ public abstract class MultiTermQuery extends Query {
protected abstract FilteredTermEnum getEnum(IndexReader reader)
throws IOException;
/**
* Expert: Return the number of unique terms visited during execution of the query.
* If there are many of them, you may consider using another query type
* or optimize your total term count in index.
* <p>This method is not thread safe, be sure to only call it when no query is running!
* If you re-use the same query instance for another
* search, be sure to first reset the term counter
* with {@link #clearTotalNumberOfTerms}.
* <p>On optimized indexes / no MultiReaders, you get the correct number of
* unique terms for the whole index. Use this number to compare different queries.
* For non-optimized indexes this number can also be achived in
* non-constant-score mode. In constant-score mode you get the total number of
* terms seeked for all segments / sub-readers.
* @see #clearTotalNumberOfTerms
*/
public int getTotalNumberOfTerms() {
return numberOfTerms;
}
/**
* Expert: Resets the counting of unique terms.
* Do this before executing the query/filter.
* @see #getTotalNumberOfTerms
*/
public void clearTotalNumberOfTerms() {
numberOfTerms = 0;
}
protected Filter getFilter() {
return new MultiTermFilter(this);
}
@ -79,6 +108,7 @@ public abstract class MultiTermQuery extends Query {
do {
Term t = enumerator.term();
if (t != null) {
numberOfTerms++;
TermQuery tq = new TermQuery(t); // found a match
tq.setBoost(getBoost() * enumerator.difference()); // set the boost
query.add(tq, BooleanClause.Occur.SHOULD); // add to query
@ -150,14 +180,14 @@ public abstract class MultiTermQuery extends Query {
MultiTermQuery mtq;
abstract class TermGenerator {
public void generate(IndexReader reader) throws IOException {
TermEnum enumerator = mtq.getEnum(reader);
public void generate(IndexReader reader, TermEnum enumerator) throws IOException {
TermDocs termDocs = reader.termDocs();
try {
do {
Term term = enumerator.term();
if (term == null)
break;
mtq.numberOfTerms++;
termDocs.seek(term);
while (termDocs.next()) {
handleDoc(termDocs.doc());
@ -165,7 +195,6 @@ public abstract class MultiTermQuery extends Query {
} while (enumerator.next());
} finally {
termDocs.close();
enumerator.close();
}
}
abstract public void handleDoc(int doc);
@ -176,28 +205,40 @@ public abstract class MultiTermQuery extends Query {
}
public BitSet bits(IndexReader reader) throws IOException {
final BitSet bitSet = new BitSet(reader.maxDoc());
new TermGenerator() {
public void handleDoc(int doc) {
bitSet.set(doc);
}
}.generate(reader);
return bitSet;
final TermEnum enumerator = mtq.getEnum(reader);
try {
final BitSet bitSet = new BitSet(reader.maxDoc());
new TermGenerator() {
public void handleDoc(int doc) {
bitSet.set(doc);
}
}.generate(reader, enumerator);
return bitSet;
} finally {
enumerator.close();
}
}
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
final OpenBitSet bitSet = new OpenBitSet(reader.maxDoc());
new TermGenerator() {
public void handleDoc(int doc) {
bitSet.set(doc);
}
}.generate(reader);
return bitSet;
final TermEnum enumerator = mtq.getEnum(reader);
try {
// if current term in enum is null, the enum is empty -> shortcut
if (enumerator.term() == null)
return DocIdSet.EMPTY_DOCIDSET;
// else fill into a OpenBitSet
final OpenBitSet bitSet = new OpenBitSet(reader.maxDoc());
new TermGenerator() {
public void handleDoc(int doc) {
bitSet.set(doc);
}
}.generate(reader, enumerator);
return bitSet;
} finally {
enumerator.close();
}
}
public boolean equals(Object o) {
if (this == o)
return true;
if (!(o instanceof MultiTermFilter))