new DisjunctionMaxQuery/DisjunctionMaxScorer from Chuck Williams: LUCENE-323

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@345089 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2005-11-16 18:47:37 +00:00
parent c40c7cd114
commit 3c10854e00
4 changed files with 919 additions and 0 deletions

View File

@ -183,6 +183,10 @@ New features
must match in a BooleanQuery. See BooleanQuery.setMinimumNumberShouldMatch(). must match in a BooleanQuery. See BooleanQuery.setMinimumNumberShouldMatch().
(Paul Elschot, Chris Hostetter via Yonik Seeley, LUCENE-395) (Paul Elschot, Chris Hostetter via Yonik Seeley, LUCENE-395)
27. Added DisjunctionMaxQuery which provides the maximum score across it's clauses.
It's very useful for searching across multiple fields.
(Chuck Williams via Yonik Seeley, LUCENE-323)
API Changes API Changes
1. Several methods and fields have been deprecated. The API documentation 1. Several methods and fields have been deprecated. The API documentation

View File

@ -0,0 +1,246 @@
package org.apache.lucene.search;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.IndexReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
/**
* A query that generates the union of the documents produced by its subqueries, and that scores each document as the maximum
* score for that document produced by any subquery plus a tie breaking increment for any additional matching subqueries.
* This is useful to search for a word in multiple fields with different boost factors (so that the fields cannot be
* combined equivalently into a single search field). We want the primary score to be the one associated with the highest boost,
* not the sum of the field scores (as BooleanQuery would give).
* If the query is "albino elephant" this ensures that "albino" matching one field and "elephant" matching
* another gets a higher score than "albino" matching both fields.
* To get this result, use both BooleanQuery and DisjunctionMaxQuery: for each term a DisjunctionMaxQuery searches for it in
* each field, while the set of these DisjunctionMaxQuery's is combined into a BooleanQuery.
* The tie breaker capability allows results that include the same term in multiple fields to be judged better than results that
* include this term in only the best of those multiple fields, without confusing this with the better case of two different terms
* in the multiple fields.
* @author Chuck Williams
*/
public class DisjunctionMaxQuery extends Query implements Iterable {
/* The subqueries */
private ArrayList disjuncts = new ArrayList();
/* Multiple of the non-max disjunct scores added into our final score. Non-zero values support tie-breaking. */
private float tieBreakerMultiplier = 0.0f;
/** Creates a new empty DisjunctionMaxQuery. Use add() to add the subqueries.
* @param tieBreakerMultiplier this score of each non-maximum disjunct for a document is multiplied by this weight
* and added into the final score. If non-zero, the value should be small, on the order of 0.1, which says that
* 10 occurrences of word in a lower-scored field that is also in a higher scored field is just as good as a unique
* word in the lower scored field (i.e., one that is not in any higher scored field.
*/
public DisjunctionMaxQuery(float tieBreakerMultiplier) {
this.tieBreakerMultiplier = tieBreakerMultiplier;
}
/**
* Creates a new DisjunctionMaxQuery
* @param disjuncts an Iterable<Query> of all the disjuncts to add
* @param tieBreakerMultiplier the weight to give to each matching non-maximum disjunct
*/
public DisjunctionMaxQuery(Iterable disjuncts, float tieBreakerMultiplier) {
this.tieBreakerMultiplier = tieBreakerMultiplier;
add(disjuncts);
}
/** Add a subquery to this disjunction
* @param query the disjunct added
*/
public void add(Query query) {
disjuncts.add(query);
}
/** Add a collection of disjuncts to this disjunction
* via Iterable<Query>
*/
public void add(Iterable disjuncts) {
Iterator i = disjuncts.iterator();
while (i.hasNext()) add((Query)i.next());
}
/** An Iterator<Query> over the disjuncts */
public Iterator iterator() {
return disjuncts.iterator();
}
/* The Weight for DisjunctionMaxQuery's, used to normalize, score and explain these queries */
private class DisjunctionMaxWeight implements Weight {
private Searcher searcher; // The searcher with which we are associated.
private ArrayList weights = new ArrayList(); // The Weight's for our subqueries, in 1-1 correspondence with disjuncts
/* Construct the Weight for this Query searched by searcher. Recursively construct subquery weights. */
public DisjunctionMaxWeight(Searcher searcher) throws IOException {
this.searcher = searcher;
for (int i = 0; i < disjuncts.size(); i++)
weights.add(((Query) disjuncts.get(i)).createWeight(searcher));
}
/* Return our associated DisjunctionMaxQuery */
public Query getQuery() { return DisjunctionMaxQuery.this; }
/* Return our boost */
public float getValue() { return getBoost(); }
/* Compute the sub of squared weights of us applied to our subqueries. Used for normalization. */
public float sumOfSquaredWeights() throws IOException {
float max = 0.0f, sum = 0.0f;
for (int i = 0; i < weights.size(); i++) {
float sub = ((Weight) weights.get(i)).sumOfSquaredWeights();
sum += sub;
max = Math.max(max, sub);
}
return (((sum - max) * tieBreakerMultiplier * tieBreakerMultiplier) + max) * getBoost() * getBoost();
}
/* Apply the computed normalization factor to our subqueries */
public void normalize(float norm) {
norm *= getBoost(); // Incorporate our boost
for (int i = 0 ; i < weights.size(); i++)
((Weight) weights.get(i)).normalize(norm);
}
/* Create the scorer used to score our associated DisjunctionMaxQuery */
public Scorer scorer(IndexReader reader) throws IOException {
DisjunctionMaxScorer result = new DisjunctionMaxScorer(tieBreakerMultiplier, getSimilarity(searcher));
for (int i = 0 ; i < weights.size(); i++) {
Weight w = (Weight) weights.get(i);
Scorer subScorer = w.scorer(reader);
if (subScorer == null) return null;
result.add(subScorer);
}
return result;
}
/* Explain the score we computed for doc */
public Explanation explain(IndexReader reader, int doc) throws IOException {
if ( disjuncts.size() == 1) return ((Weight) weights.get(0)).explain(reader,doc);
Explanation result = new Explanation();
float max = 0.0f, sum = 0.0f;
result.setDescription(tieBreakerMultiplier == 0.0f ? "max of:" : "max plus " + tieBreakerMultiplier + " times others of:");
for (int i = 0 ; i < weights.size(); i++) {
Explanation e = ((Weight) weights.get(i)).explain(reader, doc);
if (e.getValue() > 0) {
result.addDetail(e);
sum += e.getValue();
max = Math.max(max, e.getValue());
}
}
result.setValue(max + (sum - max)*tieBreakerMultiplier);
return result;
}
} // end of DisjunctionMaxWeight inner class
/* Create the Weight used to score us */
protected Weight createWeight(Searcher searcher) throws IOException {
return new DisjunctionMaxWeight(searcher);
}
/** Optimize our representation and our subqueries representations
* @param reader the IndexReader we query
* @return an optimized copy of us (which may not be a copy if there is nothing to optimize) */
public Query rewrite(IndexReader reader) throws IOException {
if (disjuncts.size() == 1) {
Query singleton = (Query) disjuncts.get(0);
Query result = singleton.rewrite(reader);
if (getBoost() != 1.0f) {
if (result == singleton) result = (Query)result.clone();
result.setBoost(getBoost() * result.getBoost());
}
return result;
}
DisjunctionMaxQuery clone = null;
for (int i = 0 ; i < disjuncts.size(); i++) {
Query clause = (Query) disjuncts.get(i);
Query rewrite = clause.rewrite(reader);
if (rewrite != clause) {
if (clone == null) clone = (DisjunctionMaxQuery)this.clone();
clone.disjuncts.set(i, rewrite);
}
}
if (clone != null) return clone;
else return this;
}
/** Create a shallow copy of us -- used in rewriting if necessary
* @return a copy of us (but reuse, don't copy, our subqueries) */
public Object clone() {
DisjunctionMaxQuery clone = (DisjunctionMaxQuery)super.clone();
clone.disjuncts = (ArrayList)this.disjuncts.clone();
return clone;
}
/** Prettyprint us.
* @param field the field to which we are applied
* @return a string that shows what we do, of the form "(disjunct1 | disjunct2 | ... | disjunctn)^boost"
*/
public String toString(String field) {
StringBuffer buffer = new StringBuffer();
buffer.append("(");
for (int i = 0 ; i < disjuncts.size(); i++) {
Query subquery = (Query) disjuncts.get(i);
if (subquery instanceof BooleanQuery) { // wrap sub-bools in parens
buffer.append("(");
buffer.append(subquery.toString(field));
buffer.append(")");
}
else buffer.append(subquery.toString(field));
if (i != disjuncts.size()-1) buffer.append(" | ");
}
buffer.append(")");
if (tieBreakerMultiplier != 0.0f) {
buffer.append("~");
buffer.append(tieBreakerMultiplier);
}
if (getBoost() != 1.0) {
buffer.append("^");
buffer.append(getBoost());
}
return buffer.toString();
}
/** Return true iff we represent the same query as o
* @param o another object
* @return true iff o is a DisjunctionMaxQuery with the same boost and the same subqueries, in the same order, as us
*/
public boolean equals(Object o) {
if (! (o instanceof DisjunctionMaxQuery) ) return false;
DisjunctionMaxQuery other = (DisjunctionMaxQuery)o;
return this.getBoost() == other.getBoost()
&& this.tieBreakerMultiplier == other.tieBreakerMultiplier
&& this.disjuncts.equals(other.disjuncts);
}
/** Compute a hash code for hashing us
* @return the hash code
*/
public int hashCode() {
return Float.floatToIntBits(getBoost())
+ Float.floatToIntBits(tieBreakerMultiplier)
+ disjuncts.hashCode();
}
}

View File

@ -0,0 +1,214 @@
package org.apache.lucene.search;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
/**
* The Scorer for DisjunctionMaxQuery's. The union of all documents generated by the the subquery scorers
* is generated in document number order. The score for each document is the maximum of the scores computed
* by the subquery scorers that generate that document, plus tieBreakerMultiplier times the sum of the scores
* for the other subqueries that generate the document.
* @author Chuck Williams
*/
class DisjunctionMaxScorer extends Scorer {
/* The scorers for subqueries that have remaining docs, kept sorted by number of next doc. */
private ArrayList subScorers = new ArrayList();
/* Multiplier applied to non-maximum-scoring subqueries for a document as they are summed into the result. */
private float tieBreakerMultiplier;
private boolean more = false; // True iff there is a next document
private boolean firstTime = true; // True iff next() has not yet been called
/* Comparator to sort subScorers according to the document number of next document */
private static class DisjunctionMaxClauseComparator implements Comparator {
/* Scorers have all been positioned at their next document already */
public int compare(Object o1, Object o2) {
if (o1 instanceof Scorer && o2 instanceof Scorer) {
Scorer s1 = (Scorer) o1;
Scorer s2 = (Scorer) o2;
return s1.doc() - s2.doc();
}
else {
throw new ClassCastException("Objects not of the type 'Scorer'");
}
}
/* Compatible equality */
public boolean equals(Scorer s1, Scorer s2) {
return s1.doc() == s2.doc();
}
}
/* Fixed instance of the comparator to reuse */
private static DisjunctionMaxClauseComparator subScorerComparator = new DisjunctionMaxClauseComparator();
/** Creates a new instance of DisjunctionMaxScorer
* @param similarity -- not used since our definition involves neither coord nor terms directly */
public DisjunctionMaxScorer(float tieBreakerMultiplier, Similarity similarity) {
super(similarity);
this.tieBreakerMultiplier = tieBreakerMultiplier;
}
/** Add the scorer for a subquery
* @param scorer the scorer of a subquery of our associated DisjunctionMaxQuery
*/
public void add(Scorer scorer) throws IOException {
if ( scorer.next() ) { // Initialize and retain only if it produces docs
subScorers.add(scorer);
more = true;
}
}
/* First time initialization. Sort subScorers. */
private void init() {
sortSubScorers();
firstTime = false;
}
/* Sort subScorers in order of document number of next document to be generated */
private void sortSubScorers() {
Scorer[] sorted = (Scorer[]) subScorers.toArray(new Scorer[subScorers.size()]);
Arrays.sort(sorted, subScorerComparator);
for (int i=0; i<sorted.length; i++) subScorers.set(i, sorted[i]);
}
/** Generate the next document matching our associated DisjunctionMaxQuery.
* @return true iff there is a next document
*/
public boolean next() throws IOException {
if ( !more ) return false;
if ( firstTime ) {
init();
return true; // more would have been false if no subScorers had any docs
}
// Increment all generators that generated the last doc and incrementally re-sort.
int lastdoc = ((Scorer) subScorers.get(0)).doc();
do {
if ( ((Scorer) subScorers.get(0)).next() ) {
Scorer s = (Scorer) subScorers.get(0);
int snextdoc = s.doc(), i=1;
for (; i<subScorers.size() && snextdoc > ((Scorer) subScorers.get(i)).doc(); i++)
subScorers.set(i-1, subScorers.get(i));
if ( i!=1 ) subScorers.set(i-1, s);
} else {
subScorers.remove(0);
if ( subScorers.isEmpty() ) return (more = false);
}
} while ( ((Scorer) subScorers.get(0)).doc()==lastdoc );
return true;
}
/** Determine the current document number. Initially invalid, until {@link #next()} is called the first time.
* @return the document number of the currently generated document
*/
public int doc() {
return ((Scorer) subScorers.get(0)).doc();
}
/** Determine the current document score. Initially invalid, until {@link #next()} is called the first time.
* @return the score of the current generated document
*/
public float score() throws IOException {
float max = ((Scorer) subScorers.get(0)).score(), sum = max;
for (int i = 1, doc = ((Scorer) subScorers.get(0)).doc(); i < subScorers.size() && ((Scorer) subScorers.get(i)).doc() == doc; i++) {
float sub = ((Scorer) subScorers.get(i)).score();
sum += sub;
max = Math.max(max, sub);
}
return max + (sum - max)*tieBreakerMultiplier;
}
/** Advance to the first document beyond the current whose number is greater than or equal to target.
* @param target the minimum number of the next desired document
* @return true iff there is a document to be generated whose number is at least target
*/
public boolean skipTo(int target) throws IOException {
int i=0;
while ( i<subScorers.size() ) {
if ( ((Scorer) subScorers.get(i)).doc() < target ) {
if ( ((Scorer) subScorers.get(i)).skipTo(target) ) i++;
else subScorers.remove(i);
} else i++;
}
if ( i == 0 ) return false;
sortSubScorers();
return true;
}
/** Explain a score that we computed. UNSUPPORTED -- see explanation capability in DisjunctionMaxQuery.
* @param doc the number of a document we scored
* @return the Explanation for our score
*/
public Explanation explain(int doc) throws IOException {
throw new UnsupportedOperationException();
}
}
/***************************************************************************
Implementation notes from http://issues.apache.org/jira/browse/LUCENE-323
There is an issue with the MaxDisjunctionScorer in the .zip attachment, I'm
sorry I did not see this earlier when I posted on java-dev about this.
The problem is that MaxDisjunctionScorer uses bubble sort to keep the subscorer
sorted over the documents in the next() method (line 103), and this does not scale nicely
when the number of subscorers increases.
Supposing the number of subscores that match the document is N,
the amount of work to be done is proportional to (N*N) per document.
In DisjunctionSumScorer a priority queue is used, and there the amount of work is
proportional to (N log(N)) per document.
So I would recommend to rewrite MaxDisjunctionScorer to inherit from a new common
super class with DisjunctionSumScorer, sharing everything except the
advanceAfterCurrent() method (which could be abstract in the new superclass).
It's possible to be more aggressive in refactoring by initializing and adapting
the score per index document using different methods, but this would take N
extra method calls per document.
At the same time the name could be changed to DisjunctionMaxScorer
for consistency in the org.lucene.search package.
Regards,
Paul Elschot
Comment by Chuck Williams [14/Nov/05 11:55 PM]
The code only uses bubble sort for the incremental resorting of an already-sorted list. The initial sort is done with Arrays.sort() which is O(n*logn). The incremental resort is O(k*n) where k is the number of clauses that match the document last generated. Even if n is large, k will usually be small. Theoretically this is O(n^2) because k could be as high as n, but this is extremely unlikely especially when n is large. More likely is that k is bounded by a small constant, in which case the algorithm is O(n). It's like Quicksort in that regard -- there are outlier cases where it won't perform well, but it will perform better than most alternatives for the vast majority of cases.
Resorting the whole list every time would perform worse. The best algorithm would probably be to use the standard insert and delete operations on a heap (as in heap sort):
while top element generated last doc
heap remove it
generate it
heap insert it
This would yield total time O(k*logn), as with a PriorityQueue.
I don't think this is much of an issue to worry about, but the algorithm could be revised to use the heap sort operations if others think it is important.
Chuck
*********************************************************************/

View File

@ -0,0 +1,455 @@
package org.apache.lucene.search;
/**
* Copyright 2005 Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import junit.framework.TestCase;
import java.text.DecimalFormat;
/**
* Test of the DisjunctionMaxQuery.
*
*/
public class TestDisjunctionMaxQuery extends TestCase{
/** threshold for comparing floats */
public static final float SCORE_COMP_THRESH = 0.0000f;
/**
* Similarity to eliminate tf, idf and lengthNorm effects to
* isolate test case.
*
* <p>
* same as TestRankingSimilarity in TestRanking.zip from
* http://issues.apache.org/jira/browse/LUCENE-323
* </p>
* @author Williams
*/
private static class TestSimilarity extends DefaultSimilarity {
public TestSimilarity() {
}
public float tf(float freq) {
if (freq > 0.0f) return 1.0f;
else return 0.0f;
}
public float lengthNorm(String fieldName, int numTerms) {
return 1.0f;
}
public float idf(int docFreq, int numDocs) {
return 1.0f;
}
}
public Similarity sim = new TestSimilarity();
public Directory index;
public IndexReader r;
public IndexSearcher s;
public void setUp() throws Exception {
index = new RAMDirectory();
IndexWriter writer = new IndexWriter(index,
new WhitespaceAnalyzer(),
true);
writer.setSimilarity(sim);
// hed is the most important field, dek is secondary
// d1 is an "ok" match for: albino elephant
{
Document d1 = new Document();
d1.add(Field.Keyword("id", "d1"));
d1.add(Field.Text("hed", "elephant"));
d1.add(Field.Text("dek", "elephant"));
writer.addDocument(d1);
}
// d2 is a "good" match for: albino elephant
{
Document d2 = new Document();
d2.add(Field.Keyword("id", "d2"));
d2.add(Field.Text("hed", "elephant"));
d2.add(Field.Text("dek", "albino"));
d2.add(Field.Text("dek", "elephant"));
writer.addDocument(d2);
}
// d3 is a "better" match for: albino elephant
{
Document d3 = new Document();
d3.add(Field.Keyword("id", "d3"));
d3.add(Field.Text("hed", "albino"));
d3.add(Field.Text("hed", "elephant"));
writer.addDocument(d3);
}
// d4 is the "best" match for: albino elephant
{
Document d4 = new Document();
d4.add(Field.Keyword("id", "d4"));
d4.add(Field.Text("hed", "albino"));
d4.add(Field.Text("hed", "elephant"));
d4.add(Field.Text("dek", "albino"));
writer.addDocument(d4);
}
writer.close();
r = IndexReader.open(index);
s = new IndexSearcher(r);
s.setSimilarity(sim);
}
public void testSimpleEqualScores1() throws Exception {
DisjunctionMaxQuery q = new DisjunctionMaxQuery(0.0f);
q.add(tq("hed","albino"));
q.add(tq("hed","elephant"));
Hits h = s.search(q);
try {
assertEquals("all docs should match " + q.toString(),
4, h.length());
float score = h.score(0);
for (int i = 1; i < h.length(); i++) {
assertEquals("score #" + i + " is not the same",
score, h.score(i), SCORE_COMP_THRESH);
}
} catch (Error e) {
printHits("testSimpleEqualScores1",h);
throw e;
}
}
public void testSimpleEqualScores2() throws Exception {
DisjunctionMaxQuery q = new DisjunctionMaxQuery(0.0f);
q.add(tq("dek","albino"));
q.add(tq("dek","elephant"));
Hits h = s.search(q);
try {
assertEquals("3 docs should match " + q.toString(),
3, h.length());
float score = h.score(0);
for (int i = 1; i < h.length(); i++) {
assertEquals("score #" + i + " is not the same",
score, h.score(i), SCORE_COMP_THRESH);
}
} catch (Error e) {
printHits("testSimpleEqualScores2",h);
throw e;
}
}
public void testSimpleEqualScores3() throws Exception {
DisjunctionMaxQuery q = new DisjunctionMaxQuery(0.0f);
q.add(tq("hed","albino"));
q.add(tq("hed","elephant"));
q.add(tq("dek","albino"));
q.add(tq("dek","elephant"));
Hits h = s.search(q);
try {
assertEquals("all docs should match " + q.toString(),
4, h.length());
float score = h.score(0);
for (int i = 1; i < h.length(); i++) {
assertEquals("score #" + i + " is not the same",
score, h.score(i), SCORE_COMP_THRESH);
}
} catch (Error e) {
printHits("testSimpleEqualScores3",h);
throw e;
}
}
public void testSimpleTiebreaker() throws Exception {
DisjunctionMaxQuery q = new DisjunctionMaxQuery(0.01f);
q.add(tq("dek","albino"));
q.add(tq("dek","elephant"));
Hits h = s.search(q);
try {
assertEquals("3 docs should match " + q.toString(),
3, h.length());
assertEquals("wrong first", "d2", h.doc(0).get("id"));
float score0 = h.score(0);
float score1 = h.score(1);
float score2 = h.score(2);
assertTrue("d2 does not have better score then others: " +
score0 + " >? " + score1,
score0 > score1);
assertEquals("d4 and d1 don't have equal scores",
score1, score2, SCORE_COMP_THRESH);
} catch (Error e) {
printHits("testSimpleTiebreaker",h);
throw e;
}
}
public void testBooleanRequiredEqualScores() throws Exception {
BooleanQuery q = new BooleanQuery();
{
DisjunctionMaxQuery q1 = new DisjunctionMaxQuery(0.0f);
q1.add(tq("hed","albino"));
q1.add(tq("dek","albino"));
q.add(q1,true,false);
}
{
DisjunctionMaxQuery q2 = new DisjunctionMaxQuery(0.0f);
q2.add(tq("hed","elephant"));
q2.add(tq("dek","elephant"));
q.add(q2,true,false);
}
Hits h = s.search(q);
try {
assertEquals("3 docs should match " + q.toString(),
3, h.length());
float score = h.score(0);
for (int i = 1; i < h.length(); i++) {
assertEquals("score #" + i + " is not the same",
score, h.score(i), SCORE_COMP_THRESH);
}
} catch (Error e) {
printHits("testBooleanRequiredEqualScores1",h);
throw e;
}
}
public void testBooleanOptionalNoTiebreaker() throws Exception {
BooleanQuery q = new BooleanQuery();
{
DisjunctionMaxQuery q1 = new DisjunctionMaxQuery(0.0f);
q1.add(tq("hed","albino"));
q1.add(tq("dek","albino"));
q.add(q1,false,false);
}
{
DisjunctionMaxQuery q2 = new DisjunctionMaxQuery(0.0f);
q2.add(tq("hed","elephant"));
q2.add(tq("dek","elephant"));
q.add(q2,false,false);
}
Hits h = s.search(q);
try {
assertEquals("4 docs should match " + q.toString(),
4, h.length());
float score = h.score(0);
for (int i = 1; i < h.length()-1; i++) { /* note: -1 */
assertEquals("score #" + i + " is not the same",
score, h.score(i), SCORE_COMP_THRESH);
}
assertEquals("wrong last", "d1", h.doc(h.length()-1).get("id"));
float score1 = h.score(h.length()-1);
assertTrue("d1 does not have worse score then others: " +
score + " >? " + score1,
score > score1);
} catch (Error e) {
printHits("testBooleanOptionalNoTiebreaker",h);
throw e;
}
}
public void testBooleanOptionalWithTiebreaker() throws Exception {
BooleanQuery q = new BooleanQuery();
{
DisjunctionMaxQuery q1 = new DisjunctionMaxQuery(0.01f);
q1.add(tq("hed","albino"));
q1.add(tq("dek","albino"));
q.add(q1,false,false);
}
{
DisjunctionMaxQuery q2 = new DisjunctionMaxQuery(0.01f);
q2.add(tq("hed","elephant"));
q2.add(tq("dek","elephant"));
q.add(q2,false,false);
}
Hits h = s.search(q);
try {
assertEquals("4 docs should match " + q.toString(),
4, h.length());
float score0 = h.score(0);
float score1 = h.score(1);
float score2 = h.score(2);
float score3 = h.score(3);
String doc0 = h.doc(0).get("id");
String doc1 = h.doc(1).get("id");
String doc2 = h.doc(2).get("id");
String doc3 = h.doc(3).get("id");
assertTrue("doc0 should be d2 or d4: " + doc0,
doc0.equals("d2") || doc0.equals("d4"));
assertTrue("doc1 should be d2 or d4: " + doc0,
doc1.equals("d2") || doc1.equals("d4"));
assertEquals("score0 and score1 should match",
score0, score1, SCORE_COMP_THRESH);
assertEquals("wrong third", "d3", doc2);
assertTrue("d3 does not have worse score then d2 and d4: " +
score1 + " >? " + score2,
score1 > score2);
assertEquals("wrong fourth", "d1", doc3);
assertTrue("d1 does not have worse score then d3: " +
score2 + " >? " + score3,
score2 > score3);
} catch (Error e) {
printHits("testBooleanOptionalWithTiebreaker",h);
throw e;
}
}
public void testBooleanOptionalWithTiebreakerAndBoost() throws Exception {
BooleanQuery q = new BooleanQuery();
{
DisjunctionMaxQuery q1 = new DisjunctionMaxQuery(0.01f);
q1.add(tq("hed","albino", 1.5f));
q1.add(tq("dek","albino"));
q.add(q1,false,false);
}
{
DisjunctionMaxQuery q2 = new DisjunctionMaxQuery(0.01f);
q2.add(tq("hed","elephant", 1.5f));
q2.add(tq("dek","elephant"));
q.add(q2,false,false);
}
Hits h = s.search(q);
try {
assertEquals("4 docs should match " + q.toString(),
4, h.length());
float score0 = h.score(0);
float score1 = h.score(1);
float score2 = h.score(2);
float score3 = h.score(3);
String doc0 = h.doc(0).get("id");
String doc1 = h.doc(1).get("id");
String doc2 = h.doc(2).get("id");
String doc3 = h.doc(3).get("id");
assertEquals("doc0 should be d4: ", "d4", doc0);
assertEquals("doc1 should be d3: ", "d3", doc1);
assertEquals("doc2 should be d2: ", "d2", doc2);
assertEquals("doc3 should be d1: ", "d1", doc3);
assertTrue("d4 does not have a better score then d3: " +
score0 + " >? " + score1,
score0 > score1);
assertTrue("d3 does not have a better score then d2: " +
score1 + " >? " + score2,
score1 > score2);
assertTrue("d3 does not have a better score then d1: " +
score2 + " >? " + score3,
score2 > score3);
} catch (Error e) {
printHits("testBooleanOptionalWithTiebreakerAndBoost",h);
throw e;
}
}
/** macro */
protected Query tq(String f, String t) {
return new TermQuery(new Term(f, t));
}
/** macro */
protected Query tq(String f, String t, float b) {
Query q = tq(f,t);
q.setBoost(b);
return q;
}
protected void printHits(String test, Hits h) throws Exception {
System.err.println("------- " + test + " -------");
DecimalFormat f = new DecimalFormat("0.000000000");
for (int i = 0; i < h.length(); i++) {
Document d = h.doc(i);
float score = h.score(i);
System.err.println("#" + i + ": " + f.format(score) + " - " +
d.get("id"));
}
}
}