LUCENE-124: Add a topterms rewrite method that scores terms only by boost value

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@920499 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-03-08 20:37:55 +00:00
parent d9ff09b759
commit 8c0dcbb8c3
3 changed files with 123 additions and 17 deletions

View File

@ -77,6 +77,12 @@ API Changes
callers can use to query whether a certain file matches a certain extension.
(Shai Erera via Mike McCandless)
* LUCENE-124: Add a TopTermsBoostOnlyBooleanQueryRewrite to MultiTermQuery.
This rewrite method is similar to TopTermsScoringBooleanQueryRewrite, but
only scores terms by their boost values. For example, this can be used
with FuzzyQuery to ensure that exact matches are always scored higher,
because only the boost will be used in scoring. (Robert Muir)
Bug fixes
* LUCENE-2119: Don't throw NegativeArraySizeException if you pass

View File

@ -162,38 +162,36 @@ public abstract class MultiTermQuery extends Query {
* @see #setRewriteMethod */
public final static RewriteMethod SCORING_BOOLEAN_QUERY_REWRITE = new ScoringBooleanQueryRewrite();
/** A rewrite method that first translates each term into
* {@link BooleanClause.Occur#SHOULD} clause in a
* BooleanQuery, and keeps the scores as computed by the
* query.
*
* <p>This rewrite mode only uses the top scoring terms
* so it will not overflow the boolean max clause count.
* It is the default rewrite mode for {@link FuzzyQuery}.
*
* @see #setRewriteMethod */
public static final class TopTermsScoringBooleanQueryRewrite extends BooleanQueryRewrite {
/**
* Base rewrite method for collecting only the top terms
* via a priority queue.
*/
public static abstract class TopTermsBooleanQueryRewrite extends BooleanQueryRewrite {
private final int size;
/**
* Create a TopTermsScoringBooleanQueryRewrite for
* Create a TopTermsBooleanQueryRewrite for
* at most <code>size</code> terms.
* <p>
* NOTE: if {@link BooleanQuery#getMaxClauseCount} is smaller than
* <code>size</code>, then it will be used instead.
*/
public TopTermsScoringBooleanQueryRewrite(int size) {
public TopTermsBooleanQueryRewrite(int size) {
this.size = size;
}
/**
* Create a TopTermsScoringBooleanQueryRewrite that is limited
* Create a TopTermsBooleanQueryRewrite that is limited
* to at most {@link BooleanQuery#getMaxClauseCount} terms.
*/
public TopTermsScoringBooleanQueryRewrite() {
public TopTermsBooleanQueryRewrite() {
this(Integer.MAX_VALUE);
}
/** Return a suitable Query for a MultiTermQuery term. */
protected abstract Query getQuery(Term term);
@Override
public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException {
final int maxSize = Math.min(size, BooleanQuery.getMaxClauseCount());
@ -218,7 +216,7 @@ public abstract class MultiTermQuery extends Query {
final BooleanQuery bq = new BooleanQuery(true);
for (final ScoreTerm st : stQueue) {
TermQuery tq = new TermQuery(st.term); // found a match
Query tq = getQuery(st.term); // found a match
tq.setBoost(query.getBoost() * st.boost); // set the boost
bq.add(tq, BooleanClause.Occur.SHOULD); // add to query
}
@ -239,7 +237,7 @@ public abstract class MultiTermQuery extends Query {
if (this == obj) return true;
if (obj == null) return false;
if (getClass() != obj.getClass()) return false;
TopTermsScoringBooleanQueryRewrite other = (TopTermsScoringBooleanQueryRewrite) obj;
TopTermsBooleanQueryRewrite other = (TopTermsBooleanQueryRewrite) obj;
if (size != other.size) return false;
return true;
}
@ -257,6 +255,84 @@ public abstract class MultiTermQuery extends Query {
}
}
/**
 * Rewrite method that turns every collected term into a
 * {@link BooleanClause.Occur#SHOULD} clause of a BooleanQuery while
 * preserving the scores as computed by the query.
 *
 * <p>
 * Only the top scoring terms are kept, so this rewrite mode will not
 * overflow the boolean max clause count. It is the default rewrite mode
 * for {@link FuzzyQuery}.
 *
 * @see #setRewriteMethod
 */
public static final class TopTermsScoringBooleanQueryRewrite
    extends TopTermsBooleanQueryRewrite {

  /**
   * Create a TopTermsScoringBooleanQueryRewrite that keeps
   * at most <code>size</code> terms.
   * <p>
   * NOTE: when {@link BooleanQuery#getMaxClauseCount} is smaller than
   * <code>size</code>, that smaller value is used instead.
   *
   * @param size upper bound on the number of terms to keep
   */
  public TopTermsScoringBooleanQueryRewrite(int size) {
    super(size);
  }

  /**
   * Create a TopTermsScoringBooleanQueryRewrite whose only limit is
   * {@link BooleanQuery#getMaxClauseCount} terms.
   */
  public TopTermsScoringBooleanQueryRewrite() {
    super();
  }

  /** Wraps the term in a plain {@link TermQuery}. */
  @Override
  protected Query getQuery(Term term) {
    final TermQuery scoredTerm = new TermQuery(term);
    return scoredTerm;
  }
}
/**
 * Rewrite method that turns every collected term into a
 * {@link BooleanClause.Occur#SHOULD} clause of a BooleanQuery, but where
 * the scores are only computed as the boost.
 * <p>
 * Only the top scoring terms are kept, so this rewrite method will not
 * overflow the boolean max clause count.
 *
 * @see #setRewriteMethod
 */
public static final class TopTermsBoostOnlyBooleanQueryRewrite
    extends TopTermsBooleanQueryRewrite {

  /**
   * Create a TopTermsBoostOnlyBooleanQueryRewrite that keeps
   * at most <code>size</code> terms.
   * <p>
   * NOTE: when {@link BooleanQuery#getMaxClauseCount} is smaller than
   * <code>size</code>, that smaller value is used instead.
   *
   * @param size upper bound on the number of terms to keep
   */
  public TopTermsBoostOnlyBooleanQueryRewrite(int size) {
    super(size);
  }

  /**
   * Create a TopTermsBoostOnlyBooleanQueryRewrite whose only limit is
   * {@link BooleanQuery#getMaxClauseCount} terms.
   */
  public TopTermsBoostOnlyBooleanQueryRewrite() {
    super();
  }

  /**
   * Wraps the term's {@link TermQuery} in a {@link QueryWrapperFilter}
   * and then in a {@link ConstantScoreQuery}.
   */
  @Override
  protected Query getQuery(Term term) {
    final TermQuery matchTerm = new TermQuery(term);
    final QueryWrapperFilter matchFilter = new QueryWrapperFilter(matchTerm);
    return new ConstantScoreQuery(matchFilter);
  }
}
private static class ConstantScoreBooleanQueryRewrite extends ScoringBooleanQueryRewrite implements Serializable {
@Override
public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException {

View File

@ -311,6 +311,30 @@ public class TestFuzzyQuery extends LuceneTestCase {
assertEquals(0, hits.length);
}
/**
 * Test the TopTermsBoostOnlyBooleanQueryRewrite rewrite method.
 * <p>
 * Indexes two "Lucene" documents and one "Lucenne" document, runs a
 * FuzzyQuery for "Lucene" with the boost-only rewrite, and checks that
 * both exact matches are ranked ahead of the fuzzy match.
 */
public void testBoostOnlyRewrite() throws Exception {
  RAMDirectory directory = new RAMDirectory();
  IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(TEST_VERSION_CURRENT),
      true, IndexWriter.MaxFieldLength.LIMITED);
  addDoc("Lucene", writer);
  addDoc("Lucene", writer);
  addDoc("Lucenne", writer);
  writer.optimize();
  writer.close();

  IndexSearcher searcher = new IndexSearcher(directory, true);
  try {
    IndexReader reader = searcher.getIndexReader();
    FuzzyQuery query = new FuzzyQuery(new Term("field", "Lucene"));
    query.setRewriteMethod(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite());
    ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(3, hits.length);
    // normally, 'Lucenne' would be the first result as IDF will skew the score.
    assertEquals("Lucene", reader.document(hits[0].doc).get("field"));
    assertEquals("Lucene", reader.document(hits[1].doc).get("field"));
    assertEquals("Lucenne", reader.document(hits[2].doc).get("field"));
  } finally {
    // Release resources even when an assertion above fails. The searcher
    // opened its reader from the directory, so closing the searcher also
    // closes that reader; no separate reader.close() is needed.
    searcher.close();
    directory.close();
  }
}
public void testGiga() throws Exception {
StandardAnalyzer analyzer = new StandardAnalyzer(TEST_VERSION_CURRENT);