mirror of https://github.com/apache/lucene.git
new DisjunctionMaxQuery/DisjunctionMaxScorer from Chuck Williams: LUCENE-323
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@345089 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
c40c7cd114
commit
3c10854e00
|
@ -183,6 +183,10 @@ New features
|
||||||
must match in a BooleanQuery. See BooleanQuery.setMinimumNumberShouldMatch().
|
must match in a BooleanQuery. See BooleanQuery.setMinimumNumberShouldMatch().
|
||||||
(Paul Elschot, Chris Hostetter via Yonik Seeley, LUCENE-395)
|
(Paul Elschot, Chris Hostetter via Yonik Seeley, LUCENE-395)
|
||||||
|
|
||||||
|
27. Added DisjunctionMaxQuery which provides the maximum score across it's clauses.
|
||||||
|
It's very useful for searching across multiple fields.
|
||||||
|
(Chuck Williams via Yonik Seeley, LUCENE-323)
|
||||||
|
|
||||||
API Changes
|
API Changes
|
||||||
|
|
||||||
1. Several methods and fields have been deprecated. The API documentation
|
1. Several methods and fields have been deprecated. The API documentation
|
||||||
|
|
|
@ -0,0 +1,246 @@
|
||||||
|
package org.apache.lucene.search;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright 2004 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Iterator;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A query that generates the union of the documents produced by its subqueries, and that scores each document as the maximum
|
||||||
|
* score for that document produced by any subquery plus a tie breaking increment for any additional matching subqueries.
|
||||||
|
* This is useful to search for a word in multiple fields with different boost factors (so that the fields cannot be
|
||||||
|
* combined equivalently into a single search field). We want the primary score to be the one associated with the highest boost,
|
||||||
|
* not the sum of the field scores (as BooleanQuery would give).
|
||||||
|
* If the query is "albino elephant" this ensures that "albino" matching one field and "elephant" matching
|
||||||
|
* another gets a higher score than "albino" matching both fields.
|
||||||
|
* To get this result, use both BooleanQuery and DisjunctionMaxQuery: for each term a DisjunctionMaxQuery searches for it in
|
||||||
|
* each field, while the set of these DisjunctionMaxQuery's is combined into a BooleanQuery.
|
||||||
|
* The tie breaker capability allows results that include the same term in multiple fields to be judged better than results that
|
||||||
|
* include this term in only the best of those multiple fields, without confusing this with the better case of two different terms
|
||||||
|
* in the multiple fields.
|
||||||
|
* @author Chuck Williams
|
||||||
|
*/
|
||||||
|
public class DisjunctionMaxQuery extends Query implements Iterable {
|
||||||
|
|
||||||
|
/* The subqueries */
|
||||||
|
private ArrayList disjuncts = new ArrayList();
|
||||||
|
|
||||||
|
/* Multiple of the non-max disjunct scores added into our final score. Non-zero values support tie-breaking. */
|
||||||
|
private float tieBreakerMultiplier = 0.0f;
|
||||||
|
|
||||||
|
/** Creates a new empty DisjunctionMaxQuery. Use add() to add the subqueries.
|
||||||
|
* @param tieBreakerMultiplier this score of each non-maximum disjunct for a document is multiplied by this weight
|
||||||
|
* and added into the final score. If non-zero, the value should be small, on the order of 0.1, which says that
|
||||||
|
* 10 occurrences of word in a lower-scored field that is also in a higher scored field is just as good as a unique
|
||||||
|
* word in the lower scored field (i.e., one that is not in any higher scored field.
|
||||||
|
*/
|
||||||
|
public DisjunctionMaxQuery(float tieBreakerMultiplier) {
|
||||||
|
this.tieBreakerMultiplier = tieBreakerMultiplier;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new DisjunctionMaxQuery
|
||||||
|
* @param disjuncts an Iterable<Query> of all the disjuncts to add
|
||||||
|
* @param tieBreakerMultiplier the weight to give to each matching non-maximum disjunct
|
||||||
|
*/
|
||||||
|
public DisjunctionMaxQuery(Iterable disjuncts, float tieBreakerMultiplier) {
|
||||||
|
this.tieBreakerMultiplier = tieBreakerMultiplier;
|
||||||
|
add(disjuncts);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Add a subquery to this disjunction
|
||||||
|
* @param query the disjunct added
|
||||||
|
*/
|
||||||
|
public void add(Query query) {
|
||||||
|
disjuncts.add(query);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Add a collection of disjuncts to this disjunction
|
||||||
|
* via Iterable<Query>
|
||||||
|
*/
|
||||||
|
public void add(Iterable disjuncts) {
|
||||||
|
Iterator i = disjuncts.iterator();
|
||||||
|
while (i.hasNext()) add((Query)i.next());
|
||||||
|
}
|
||||||
|
|
||||||
|
/** An Iterator<Query> over the disjuncts */
|
||||||
|
public Iterator iterator() {
|
||||||
|
return disjuncts.iterator();
|
||||||
|
}
|
||||||
|
|
||||||
|
/* The Weight for DisjunctionMaxQuery's, used to normalize, score and explain these queries */
|
||||||
|
private class DisjunctionMaxWeight implements Weight {
|
||||||
|
|
||||||
|
private Searcher searcher; // The searcher with which we are associated.
|
||||||
|
private ArrayList weights = new ArrayList(); // The Weight's for our subqueries, in 1-1 correspondence with disjuncts
|
||||||
|
|
||||||
|
/* Construct the Weight for this Query searched by searcher. Recursively construct subquery weights. */
|
||||||
|
public DisjunctionMaxWeight(Searcher searcher) throws IOException {
|
||||||
|
this.searcher = searcher;
|
||||||
|
for (int i = 0; i < disjuncts.size(); i++)
|
||||||
|
weights.add(((Query) disjuncts.get(i)).createWeight(searcher));
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Return our associated DisjunctionMaxQuery */
|
||||||
|
public Query getQuery() { return DisjunctionMaxQuery.this; }
|
||||||
|
|
||||||
|
/* Return our boost */
|
||||||
|
public float getValue() { return getBoost(); }
|
||||||
|
|
||||||
|
/* Compute the sub of squared weights of us applied to our subqueries. Used for normalization. */
|
||||||
|
public float sumOfSquaredWeights() throws IOException {
|
||||||
|
float max = 0.0f, sum = 0.0f;
|
||||||
|
for (int i = 0; i < weights.size(); i++) {
|
||||||
|
float sub = ((Weight) weights.get(i)).sumOfSquaredWeights();
|
||||||
|
sum += sub;
|
||||||
|
max = Math.max(max, sub);
|
||||||
|
}
|
||||||
|
return (((sum - max) * tieBreakerMultiplier * tieBreakerMultiplier) + max) * getBoost() * getBoost();
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Apply the computed normalization factor to our subqueries */
|
||||||
|
public void normalize(float norm) {
|
||||||
|
norm *= getBoost(); // Incorporate our boost
|
||||||
|
for (int i = 0 ; i < weights.size(); i++)
|
||||||
|
((Weight) weights.get(i)).normalize(norm);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Create the scorer used to score our associated DisjunctionMaxQuery */
|
||||||
|
public Scorer scorer(IndexReader reader) throws IOException {
|
||||||
|
DisjunctionMaxScorer result = new DisjunctionMaxScorer(tieBreakerMultiplier, getSimilarity(searcher));
|
||||||
|
for (int i = 0 ; i < weights.size(); i++) {
|
||||||
|
Weight w = (Weight) weights.get(i);
|
||||||
|
Scorer subScorer = w.scorer(reader);
|
||||||
|
if (subScorer == null) return null;
|
||||||
|
result.add(subScorer);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Explain the score we computed for doc */
|
||||||
|
public Explanation explain(IndexReader reader, int doc) throws IOException {
|
||||||
|
if ( disjuncts.size() == 1) return ((Weight) weights.get(0)).explain(reader,doc);
|
||||||
|
Explanation result = new Explanation();
|
||||||
|
float max = 0.0f, sum = 0.0f;
|
||||||
|
result.setDescription(tieBreakerMultiplier == 0.0f ? "max of:" : "max plus " + tieBreakerMultiplier + " times others of:");
|
||||||
|
for (int i = 0 ; i < weights.size(); i++) {
|
||||||
|
Explanation e = ((Weight) weights.get(i)).explain(reader, doc);
|
||||||
|
if (e.getValue() > 0) {
|
||||||
|
result.addDetail(e);
|
||||||
|
sum += e.getValue();
|
||||||
|
max = Math.max(max, e.getValue());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
result.setValue(max + (sum - max)*tieBreakerMultiplier);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // end of DisjunctionMaxWeight inner class
|
||||||
|
|
||||||
|
/* Create the Weight used to score us */
|
||||||
|
protected Weight createWeight(Searcher searcher) throws IOException {
|
||||||
|
return new DisjunctionMaxWeight(searcher);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Optimize our representation and our subqueries representations
|
||||||
|
* @param reader the IndexReader we query
|
||||||
|
* @return an optimized copy of us (which may not be a copy if there is nothing to optimize) */
|
||||||
|
public Query rewrite(IndexReader reader) throws IOException {
|
||||||
|
if (disjuncts.size() == 1) {
|
||||||
|
Query singleton = (Query) disjuncts.get(0);
|
||||||
|
Query result = singleton.rewrite(reader);
|
||||||
|
if (getBoost() != 1.0f) {
|
||||||
|
if (result == singleton) result = (Query)result.clone();
|
||||||
|
result.setBoost(getBoost() * result.getBoost());
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
DisjunctionMaxQuery clone = null;
|
||||||
|
for (int i = 0 ; i < disjuncts.size(); i++) {
|
||||||
|
Query clause = (Query) disjuncts.get(i);
|
||||||
|
Query rewrite = clause.rewrite(reader);
|
||||||
|
if (rewrite != clause) {
|
||||||
|
if (clone == null) clone = (DisjunctionMaxQuery)this.clone();
|
||||||
|
clone.disjuncts.set(i, rewrite);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (clone != null) return clone;
|
||||||
|
else return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Create a shallow copy of us -- used in rewriting if necessary
|
||||||
|
* @return a copy of us (but reuse, don't copy, our subqueries) */
|
||||||
|
public Object clone() {
|
||||||
|
DisjunctionMaxQuery clone = (DisjunctionMaxQuery)super.clone();
|
||||||
|
clone.disjuncts = (ArrayList)this.disjuncts.clone();
|
||||||
|
return clone;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Prettyprint us.
|
||||||
|
* @param field the field to which we are applied
|
||||||
|
* @return a string that shows what we do, of the form "(disjunct1 | disjunct2 | ... | disjunctn)^boost"
|
||||||
|
*/
|
||||||
|
public String toString(String field) {
|
||||||
|
StringBuffer buffer = new StringBuffer();
|
||||||
|
buffer.append("(");
|
||||||
|
for (int i = 0 ; i < disjuncts.size(); i++) {
|
||||||
|
Query subquery = (Query) disjuncts.get(i);
|
||||||
|
if (subquery instanceof BooleanQuery) { // wrap sub-bools in parens
|
||||||
|
buffer.append("(");
|
||||||
|
buffer.append(subquery.toString(field));
|
||||||
|
buffer.append(")");
|
||||||
|
}
|
||||||
|
else buffer.append(subquery.toString(field));
|
||||||
|
if (i != disjuncts.size()-1) buffer.append(" | ");
|
||||||
|
}
|
||||||
|
buffer.append(")");
|
||||||
|
if (tieBreakerMultiplier != 0.0f) {
|
||||||
|
buffer.append("~");
|
||||||
|
buffer.append(tieBreakerMultiplier);
|
||||||
|
}
|
||||||
|
if (getBoost() != 1.0) {
|
||||||
|
buffer.append("^");
|
||||||
|
buffer.append(getBoost());
|
||||||
|
}
|
||||||
|
return buffer.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Return true iff we represent the same query as o
|
||||||
|
* @param o another object
|
||||||
|
* @return true iff o is a DisjunctionMaxQuery with the same boost and the same subqueries, in the same order, as us
|
||||||
|
*/
|
||||||
|
public boolean equals(Object o) {
|
||||||
|
if (! (o instanceof DisjunctionMaxQuery) ) return false;
|
||||||
|
DisjunctionMaxQuery other = (DisjunctionMaxQuery)o;
|
||||||
|
return this.getBoost() == other.getBoost()
|
||||||
|
&& this.tieBreakerMultiplier == other.tieBreakerMultiplier
|
||||||
|
&& this.disjuncts.equals(other.disjuncts);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Compute a hash code for hashing us
|
||||||
|
* @return the hash code
|
||||||
|
*/
|
||||||
|
public int hashCode() {
|
||||||
|
return Float.floatToIntBits(getBoost())
|
||||||
|
+ Float.floatToIntBits(tieBreakerMultiplier)
|
||||||
|
+ disjuncts.hashCode();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,214 @@
|
||||||
|
package org.apache.lucene.search;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright 2004 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Comparator;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The Scorer for DisjunctionMaxQuery's. The union of all documents generated by the the subquery scorers
|
||||||
|
* is generated in document number order. The score for each document is the maximum of the scores computed
|
||||||
|
* by the subquery scorers that generate that document, plus tieBreakerMultiplier times the sum of the scores
|
||||||
|
* for the other subqueries that generate the document.
|
||||||
|
* @author Chuck Williams
|
||||||
|
*/
|
||||||
|
class DisjunctionMaxScorer extends Scorer {
|
||||||
|
|
||||||
|
/* The scorers for subqueries that have remaining docs, kept sorted by number of next doc. */
|
||||||
|
private ArrayList subScorers = new ArrayList();
|
||||||
|
|
||||||
|
/* Multiplier applied to non-maximum-scoring subqueries for a document as they are summed into the result. */
|
||||||
|
private float tieBreakerMultiplier;
|
||||||
|
|
||||||
|
private boolean more = false; // True iff there is a next document
|
||||||
|
private boolean firstTime = true; // True iff next() has not yet been called
|
||||||
|
|
||||||
|
/* Comparator to sort subScorers according to the document number of next document */
|
||||||
|
private static class DisjunctionMaxClauseComparator implements Comparator {
|
||||||
|
|
||||||
|
/* Scorers have all been positioned at their next document already */
|
||||||
|
public int compare(Object o1, Object o2) {
|
||||||
|
if (o1 instanceof Scorer && o2 instanceof Scorer) {
|
||||||
|
Scorer s1 = (Scorer) o1;
|
||||||
|
Scorer s2 = (Scorer) o2;
|
||||||
|
|
||||||
|
return s1.doc() - s2.doc();
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
throw new ClassCastException("Objects not of the type 'Scorer'");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Compatible equality */
|
||||||
|
public boolean equals(Scorer s1, Scorer s2) {
|
||||||
|
return s1.doc() == s2.doc();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Fixed instance of the comparator to reuse */
|
||||||
|
private static DisjunctionMaxClauseComparator subScorerComparator = new DisjunctionMaxClauseComparator();
|
||||||
|
|
||||||
|
/** Creates a new instance of DisjunctionMaxScorer
|
||||||
|
* @param similarity -- not used since our definition involves neither coord nor terms directly */
|
||||||
|
public DisjunctionMaxScorer(float tieBreakerMultiplier, Similarity similarity) {
|
||||||
|
super(similarity);
|
||||||
|
this.tieBreakerMultiplier = tieBreakerMultiplier;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Add the scorer for a subquery
|
||||||
|
* @param scorer the scorer of a subquery of our associated DisjunctionMaxQuery
|
||||||
|
*/
|
||||||
|
public void add(Scorer scorer) throws IOException {
|
||||||
|
if ( scorer.next() ) { // Initialize and retain only if it produces docs
|
||||||
|
subScorers.add(scorer);
|
||||||
|
more = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* First time initialization. Sort subScorers. */
|
||||||
|
private void init() {
|
||||||
|
sortSubScorers();
|
||||||
|
firstTime = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Sort subScorers in order of document number of next document to be generated */
|
||||||
|
private void sortSubScorers() {
|
||||||
|
Scorer[] sorted = (Scorer[]) subScorers.toArray(new Scorer[subScorers.size()]);
|
||||||
|
Arrays.sort(sorted, subScorerComparator);
|
||||||
|
for (int i=0; i<sorted.length; i++) subScorers.set(i, sorted[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Generate the next document matching our associated DisjunctionMaxQuery.
|
||||||
|
* @return true iff there is a next document
|
||||||
|
*/
|
||||||
|
public boolean next() throws IOException {
|
||||||
|
if ( !more ) return false;
|
||||||
|
if ( firstTime ) {
|
||||||
|
init();
|
||||||
|
return true; // more would have been false if no subScorers had any docs
|
||||||
|
}
|
||||||
|
// Increment all generators that generated the last doc and incrementally re-sort.
|
||||||
|
int lastdoc = ((Scorer) subScorers.get(0)).doc();
|
||||||
|
do {
|
||||||
|
if ( ((Scorer) subScorers.get(0)).next() ) {
|
||||||
|
Scorer s = (Scorer) subScorers.get(0);
|
||||||
|
int snextdoc = s.doc(), i=1;
|
||||||
|
for (; i<subScorers.size() && snextdoc > ((Scorer) subScorers.get(i)).doc(); i++)
|
||||||
|
subScorers.set(i-1, subScorers.get(i));
|
||||||
|
if ( i!=1 ) subScorers.set(i-1, s);
|
||||||
|
} else {
|
||||||
|
subScorers.remove(0);
|
||||||
|
if ( subScorers.isEmpty() ) return (more = false);
|
||||||
|
}
|
||||||
|
} while ( ((Scorer) subScorers.get(0)).doc()==lastdoc );
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Determine the current document number. Initially invalid, until {@link #next()} is called the first time.
|
||||||
|
* @return the document number of the currently generated document
|
||||||
|
*/
|
||||||
|
public int doc() {
|
||||||
|
return ((Scorer) subScorers.get(0)).doc();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Determine the current document score. Initially invalid, until {@link #next()} is called the first time.
|
||||||
|
* @return the score of the current generated document
|
||||||
|
*/
|
||||||
|
public float score() throws IOException {
|
||||||
|
float max = ((Scorer) subScorers.get(0)).score(), sum = max;
|
||||||
|
for (int i = 1, doc = ((Scorer) subScorers.get(0)).doc(); i < subScorers.size() && ((Scorer) subScorers.get(i)).doc() == doc; i++) {
|
||||||
|
float sub = ((Scorer) subScorers.get(i)).score();
|
||||||
|
sum += sub;
|
||||||
|
max = Math.max(max, sub);
|
||||||
|
}
|
||||||
|
return max + (sum - max)*tieBreakerMultiplier;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Advance to the first document beyond the current whose number is greater than or equal to target.
|
||||||
|
* @param target the minimum number of the next desired document
|
||||||
|
* @return true iff there is a document to be generated whose number is at least target
|
||||||
|
*/
|
||||||
|
public boolean skipTo(int target) throws IOException {
|
||||||
|
int i=0;
|
||||||
|
while ( i<subScorers.size() ) {
|
||||||
|
if ( ((Scorer) subScorers.get(i)).doc() < target ) {
|
||||||
|
if ( ((Scorer) subScorers.get(i)).skipTo(target) ) i++;
|
||||||
|
else subScorers.remove(i);
|
||||||
|
} else i++;
|
||||||
|
}
|
||||||
|
if ( i == 0 ) return false;
|
||||||
|
sortSubScorers();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Explain a score that we computed. UNSUPPORTED -- see explanation capability in DisjunctionMaxQuery.
|
||||||
|
* @param doc the number of a document we scored
|
||||||
|
* @return the Explanation for our score
|
||||||
|
*/
|
||||||
|
public Explanation explain(int doc) throws IOException {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/***************************************************************************
|
||||||
|
Implementation notes from http://issues.apache.org/jira/browse/LUCENE-323
|
||||||
|
|
||||||
|
|
||||||
|
There is an issue with the MaxDisjunctionScorer in the .zip attachment, I'm
|
||||||
|
sorry I did not see this earlier when I posted on java-dev about this.
|
||||||
|
|
||||||
|
The problem is that MaxDisjunctionScorer uses bubble sort to keep the subscorer
|
||||||
|
sorted over the documents in the next() method (line 103), and this does not scale nicely
|
||||||
|
when the number of subscorers increases.
|
||||||
|
Supposing the number of subscores that match the document is N,
|
||||||
|
the amount of work to be done is proportional to (N*N) per document.
|
||||||
|
In DisjunctionSumScorer a priority queue is used, and there the amount of work is
|
||||||
|
proportional to (N log(N)) per document.
|
||||||
|
So I would recommend to rewrite MaxDisjunctionScorer to inherit from a new common
|
||||||
|
super class with DisjunctionSumScorer, sharing everything except the
|
||||||
|
advanceAfterCurrent() method (which could be abstract in the new superclass).
|
||||||
|
It's possible to be more aggressive in refactoring by initializing and adapting
|
||||||
|
the score per index document using different methods, but this would take N
|
||||||
|
extra method calls per document.
|
||||||
|
|
||||||
|
At the same time the name could be changed to DisjunctionMaxScorer
|
||||||
|
for consistency in the org.lucene.search package.
|
||||||
|
|
||||||
|
Regards,
|
||||||
|
Paul Elschot
|
||||||
|
|
||||||
|
Comment by Chuck Williams [14/Nov/05 11:55 PM]
|
||||||
|
The code only uses bubble sort for the incremental resorting of an already-sorted list. The initial sort is done with Arrays.sort() which is O(n*logn). The incremental resort is O(k*n) where k is the number of clauses that match the document last generated. Even if n is large, k will usually be small. Theoretically this is O(n^2) because k could be as high as n, but this is extremely unlikely especially when n is large. More likely is that k is bounded by a small constant, in which case the algorithm is O(n). It's like Quicksort in that regard -- there are outlier cases where it won't perform well, but it will perform better than most alternatives for the vast majority of cases.
|
||||||
|
|
||||||
|
Resorting the whole list every time would perform worse. The best algorithm would probably be to use the standard insert and delete operations on a heap (as in heap sort):
|
||||||
|
|
||||||
|
while top element generated last doc
|
||||||
|
heap remove it
|
||||||
|
generate it
|
||||||
|
heap insert it
|
||||||
|
|
||||||
|
This would yield total time O(k*logn), as with a PriorityQueue.
|
||||||
|
|
||||||
|
I don't think this is much of an issue to worry about, but the algorithm could be revised to use the heap sort operations if others think it is important.
|
||||||
|
|
||||||
|
Chuck
|
||||||
|
|
||||||
|
*********************************************************************/
|
|
@ -0,0 +1,455 @@
|
||||||
|
package org.apache.lucene.search;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright 2005 Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.WhitespaceAnalyzer;
|
||||||
|
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.document.Field;
|
||||||
|
|
||||||
|
import org.apache.lucene.index.IndexWriter;
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
|
||||||
|
import org.apache.lucene.search.Hits;
|
||||||
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
|
import org.apache.lucene.search.Query;
|
||||||
|
import org.apache.lucene.search.Similarity;
|
||||||
|
import org.apache.lucene.search.DefaultSimilarity;
|
||||||
|
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.store.RAMDirectory;
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
|
import java.text.DecimalFormat;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test of the DisjunctionMaxQuery.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class TestDisjunctionMaxQuery extends TestCase{
|
||||||
|
|
||||||
|
/** threshold for comparing floats */
|
||||||
|
public static final float SCORE_COMP_THRESH = 0.0000f;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Similarity to eliminate tf, idf and lengthNorm effects to
|
||||||
|
* isolate test case.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* same as TestRankingSimilarity in TestRanking.zip from
|
||||||
|
* http://issues.apache.org/jira/browse/LUCENE-323
|
||||||
|
* </p>
|
||||||
|
* @author Williams
|
||||||
|
*/
|
||||||
|
private static class TestSimilarity extends DefaultSimilarity {
|
||||||
|
|
||||||
|
public TestSimilarity() {
|
||||||
|
}
|
||||||
|
public float tf(float freq) {
|
||||||
|
if (freq > 0.0f) return 1.0f;
|
||||||
|
else return 0.0f;
|
||||||
|
}
|
||||||
|
public float lengthNorm(String fieldName, int numTerms) {
|
||||||
|
return 1.0f;
|
||||||
|
}
|
||||||
|
public float idf(int docFreq, int numDocs) {
|
||||||
|
return 1.0f;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public Similarity sim = new TestSimilarity();
|
||||||
|
public Directory index;
|
||||||
|
public IndexReader r;
|
||||||
|
public IndexSearcher s;
|
||||||
|
|
||||||
|
public void setUp() throws Exception {
|
||||||
|
|
||||||
|
index = new RAMDirectory();
|
||||||
|
IndexWriter writer = new IndexWriter(index,
|
||||||
|
new WhitespaceAnalyzer(),
|
||||||
|
true);
|
||||||
|
writer.setSimilarity(sim);
|
||||||
|
|
||||||
|
// hed is the most important field, dek is secondary
|
||||||
|
|
||||||
|
// d1 is an "ok" match for: albino elephant
|
||||||
|
{
|
||||||
|
Document d1 = new Document();
|
||||||
|
d1.add(Field.Keyword("id", "d1"));
|
||||||
|
d1.add(Field.Text("hed", "elephant"));
|
||||||
|
d1.add(Field.Text("dek", "elephant"));
|
||||||
|
writer.addDocument(d1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// d2 is a "good" match for: albino elephant
|
||||||
|
{
|
||||||
|
Document d2 = new Document();
|
||||||
|
d2.add(Field.Keyword("id", "d2"));
|
||||||
|
d2.add(Field.Text("hed", "elephant"));
|
||||||
|
d2.add(Field.Text("dek", "albino"));
|
||||||
|
d2.add(Field.Text("dek", "elephant"));
|
||||||
|
writer.addDocument(d2);
|
||||||
|
}
|
||||||
|
|
||||||
|
// d3 is a "better" match for: albino elephant
|
||||||
|
{
|
||||||
|
Document d3 = new Document();
|
||||||
|
d3.add(Field.Keyword("id", "d3"));
|
||||||
|
d3.add(Field.Text("hed", "albino"));
|
||||||
|
d3.add(Field.Text("hed", "elephant"));
|
||||||
|
writer.addDocument(d3);
|
||||||
|
}
|
||||||
|
|
||||||
|
// d4 is the "best" match for: albino elephant
|
||||||
|
{
|
||||||
|
Document d4 = new Document();
|
||||||
|
d4.add(Field.Keyword("id", "d4"));
|
||||||
|
d4.add(Field.Text("hed", "albino"));
|
||||||
|
d4.add(Field.Text("hed", "elephant"));
|
||||||
|
d4.add(Field.Text("dek", "albino"));
|
||||||
|
writer.addDocument(d4);
|
||||||
|
}
|
||||||
|
|
||||||
|
writer.close();
|
||||||
|
|
||||||
|
r = IndexReader.open(index);
|
||||||
|
s = new IndexSearcher(r);
|
||||||
|
s.setSimilarity(sim);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void testSimpleEqualScores1() throws Exception {
|
||||||
|
|
||||||
|
DisjunctionMaxQuery q = new DisjunctionMaxQuery(0.0f);
|
||||||
|
q.add(tq("hed","albino"));
|
||||||
|
q.add(tq("hed","elephant"));
|
||||||
|
|
||||||
|
Hits h = s.search(q);
|
||||||
|
|
||||||
|
try {
|
||||||
|
assertEquals("all docs should match " + q.toString(),
|
||||||
|
4, h.length());
|
||||||
|
|
||||||
|
float score = h.score(0);
|
||||||
|
for (int i = 1; i < h.length(); i++) {
|
||||||
|
assertEquals("score #" + i + " is not the same",
|
||||||
|
score, h.score(i), SCORE_COMP_THRESH);
|
||||||
|
}
|
||||||
|
} catch (Error e) {
|
||||||
|
printHits("testSimpleEqualScores1",h);
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testSimpleEqualScores2() throws Exception {
|
||||||
|
|
||||||
|
DisjunctionMaxQuery q = new DisjunctionMaxQuery(0.0f);
|
||||||
|
q.add(tq("dek","albino"));
|
||||||
|
q.add(tq("dek","elephant"));
|
||||||
|
|
||||||
|
Hits h = s.search(q);
|
||||||
|
|
||||||
|
try {
|
||||||
|
assertEquals("3 docs should match " + q.toString(),
|
||||||
|
3, h.length());
|
||||||
|
float score = h.score(0);
|
||||||
|
for (int i = 1; i < h.length(); i++) {
|
||||||
|
assertEquals("score #" + i + " is not the same",
|
||||||
|
score, h.score(i), SCORE_COMP_THRESH);
|
||||||
|
}
|
||||||
|
} catch (Error e) {
|
||||||
|
printHits("testSimpleEqualScores2",h);
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testSimpleEqualScores3() throws Exception {
|
||||||
|
|
||||||
|
DisjunctionMaxQuery q = new DisjunctionMaxQuery(0.0f);
|
||||||
|
q.add(tq("hed","albino"));
|
||||||
|
q.add(tq("hed","elephant"));
|
||||||
|
q.add(tq("dek","albino"));
|
||||||
|
q.add(tq("dek","elephant"));
|
||||||
|
|
||||||
|
Hits h = s.search(q);
|
||||||
|
|
||||||
|
try {
|
||||||
|
assertEquals("all docs should match " + q.toString(),
|
||||||
|
4, h.length());
|
||||||
|
float score = h.score(0);
|
||||||
|
for (int i = 1; i < h.length(); i++) {
|
||||||
|
assertEquals("score #" + i + " is not the same",
|
||||||
|
score, h.score(i), SCORE_COMP_THRESH);
|
||||||
|
}
|
||||||
|
} catch (Error e) {
|
||||||
|
printHits("testSimpleEqualScores3",h);
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testSimpleTiebreaker() throws Exception {
|
||||||
|
|
||||||
|
DisjunctionMaxQuery q = new DisjunctionMaxQuery(0.01f);
|
||||||
|
q.add(tq("dek","albino"));
|
||||||
|
q.add(tq("dek","elephant"));
|
||||||
|
|
||||||
|
Hits h = s.search(q);
|
||||||
|
|
||||||
|
try {
|
||||||
|
assertEquals("3 docs should match " + q.toString(),
|
||||||
|
3, h.length());
|
||||||
|
assertEquals("wrong first", "d2", h.doc(0).get("id"));
|
||||||
|
float score0 = h.score(0);
|
||||||
|
float score1 = h.score(1);
|
||||||
|
float score2 = h.score(2);
|
||||||
|
assertTrue("d2 does not have better score then others: " +
|
||||||
|
score0 + " >? " + score1,
|
||||||
|
score0 > score1);
|
||||||
|
assertEquals("d4 and d1 don't have equal scores",
|
||||||
|
score1, score2, SCORE_COMP_THRESH);
|
||||||
|
} catch (Error e) {
|
||||||
|
printHits("testSimpleTiebreaker",h);
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testBooleanRequiredEqualScores() throws Exception {
|
||||||
|
|
||||||
|
BooleanQuery q = new BooleanQuery();
|
||||||
|
{
|
||||||
|
DisjunctionMaxQuery q1 = new DisjunctionMaxQuery(0.0f);
|
||||||
|
q1.add(tq("hed","albino"));
|
||||||
|
q1.add(tq("dek","albino"));
|
||||||
|
q.add(q1,true,false);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
DisjunctionMaxQuery q2 = new DisjunctionMaxQuery(0.0f);
|
||||||
|
q2.add(tq("hed","elephant"));
|
||||||
|
q2.add(tq("dek","elephant"));
|
||||||
|
q.add(q2,true,false);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Hits h = s.search(q);
|
||||||
|
|
||||||
|
try {
|
||||||
|
assertEquals("3 docs should match " + q.toString(),
|
||||||
|
3, h.length());
|
||||||
|
float score = h.score(0);
|
||||||
|
for (int i = 1; i < h.length(); i++) {
|
||||||
|
assertEquals("score #" + i + " is not the same",
|
||||||
|
score, h.score(i), SCORE_COMP_THRESH);
|
||||||
|
}
|
||||||
|
} catch (Error e) {
|
||||||
|
printHits("testBooleanRequiredEqualScores1",h);
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void testBooleanOptionalNoTiebreaker() throws Exception {
|
||||||
|
|
||||||
|
BooleanQuery q = new BooleanQuery();
|
||||||
|
{
|
||||||
|
DisjunctionMaxQuery q1 = new DisjunctionMaxQuery(0.0f);
|
||||||
|
q1.add(tq("hed","albino"));
|
||||||
|
q1.add(tq("dek","albino"));
|
||||||
|
q.add(q1,false,false);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
DisjunctionMaxQuery q2 = new DisjunctionMaxQuery(0.0f);
|
||||||
|
q2.add(tq("hed","elephant"));
|
||||||
|
q2.add(tq("dek","elephant"));
|
||||||
|
q.add(q2,false,false);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Hits h = s.search(q);
|
||||||
|
|
||||||
|
try {
|
||||||
|
assertEquals("4 docs should match " + q.toString(),
|
||||||
|
4, h.length());
|
||||||
|
float score = h.score(0);
|
||||||
|
for (int i = 1; i < h.length()-1; i++) { /* note: -1 */
|
||||||
|
assertEquals("score #" + i + " is not the same",
|
||||||
|
score, h.score(i), SCORE_COMP_THRESH);
|
||||||
|
}
|
||||||
|
assertEquals("wrong last", "d1", h.doc(h.length()-1).get("id"));
|
||||||
|
float score1 = h.score(h.length()-1);
|
||||||
|
assertTrue("d1 does not have worse score then others: " +
|
||||||
|
score + " >? " + score1,
|
||||||
|
score > score1);
|
||||||
|
} catch (Error e) {
|
||||||
|
printHits("testBooleanOptionalNoTiebreaker",h);
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void testBooleanOptionalWithTiebreaker() throws Exception {
|
||||||
|
|
||||||
|
BooleanQuery q = new BooleanQuery();
|
||||||
|
{
|
||||||
|
DisjunctionMaxQuery q1 = new DisjunctionMaxQuery(0.01f);
|
||||||
|
q1.add(tq("hed","albino"));
|
||||||
|
q1.add(tq("dek","albino"));
|
||||||
|
q.add(q1,false,false);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
DisjunctionMaxQuery q2 = new DisjunctionMaxQuery(0.01f);
|
||||||
|
q2.add(tq("hed","elephant"));
|
||||||
|
q2.add(tq("dek","elephant"));
|
||||||
|
q.add(q2,false,false);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Hits h = s.search(q);
|
||||||
|
|
||||||
|
try {
|
||||||
|
|
||||||
|
assertEquals("4 docs should match " + q.toString(),
|
||||||
|
4, h.length());
|
||||||
|
|
||||||
|
float score0 = h.score(0);
|
||||||
|
float score1 = h.score(1);
|
||||||
|
float score2 = h.score(2);
|
||||||
|
float score3 = h.score(3);
|
||||||
|
|
||||||
|
String doc0 = h.doc(0).get("id");
|
||||||
|
String doc1 = h.doc(1).get("id");
|
||||||
|
String doc2 = h.doc(2).get("id");
|
||||||
|
String doc3 = h.doc(3).get("id");
|
||||||
|
|
||||||
|
assertTrue("doc0 should be d2 or d4: " + doc0,
|
||||||
|
doc0.equals("d2") || doc0.equals("d4"));
|
||||||
|
assertTrue("doc1 should be d2 or d4: " + doc0,
|
||||||
|
doc1.equals("d2") || doc1.equals("d4"));
|
||||||
|
assertEquals("score0 and score1 should match",
|
||||||
|
score0, score1, SCORE_COMP_THRESH);
|
||||||
|
assertEquals("wrong third", "d3", doc2);
|
||||||
|
assertTrue("d3 does not have worse score then d2 and d4: " +
|
||||||
|
score1 + " >? " + score2,
|
||||||
|
score1 > score2);
|
||||||
|
|
||||||
|
assertEquals("wrong fourth", "d1", doc3);
|
||||||
|
assertTrue("d1 does not have worse score then d3: " +
|
||||||
|
score2 + " >? " + score3,
|
||||||
|
score2 > score3);
|
||||||
|
|
||||||
|
} catch (Error e) {
|
||||||
|
printHits("testBooleanOptionalWithTiebreaker",h);
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void testBooleanOptionalWithTiebreakerAndBoost() throws Exception {
|
||||||
|
|
||||||
|
BooleanQuery q = new BooleanQuery();
|
||||||
|
{
|
||||||
|
DisjunctionMaxQuery q1 = new DisjunctionMaxQuery(0.01f);
|
||||||
|
q1.add(tq("hed","albino", 1.5f));
|
||||||
|
q1.add(tq("dek","albino"));
|
||||||
|
q.add(q1,false,false);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
DisjunctionMaxQuery q2 = new DisjunctionMaxQuery(0.01f);
|
||||||
|
q2.add(tq("hed","elephant", 1.5f));
|
||||||
|
q2.add(tq("dek","elephant"));
|
||||||
|
q.add(q2,false,false);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Hits h = s.search(q);
|
||||||
|
|
||||||
|
try {
|
||||||
|
|
||||||
|
assertEquals("4 docs should match " + q.toString(),
|
||||||
|
4, h.length());
|
||||||
|
|
||||||
|
float score0 = h.score(0);
|
||||||
|
float score1 = h.score(1);
|
||||||
|
float score2 = h.score(2);
|
||||||
|
float score3 = h.score(3);
|
||||||
|
|
||||||
|
String doc0 = h.doc(0).get("id");
|
||||||
|
String doc1 = h.doc(1).get("id");
|
||||||
|
String doc2 = h.doc(2).get("id");
|
||||||
|
String doc3 = h.doc(3).get("id");
|
||||||
|
|
||||||
|
assertEquals("doc0 should be d4: ", "d4", doc0);
|
||||||
|
assertEquals("doc1 should be d3: ", "d3", doc1);
|
||||||
|
assertEquals("doc2 should be d2: ", "d2", doc2);
|
||||||
|
assertEquals("doc3 should be d1: ", "d1", doc3);
|
||||||
|
|
||||||
|
assertTrue("d4 does not have a better score then d3: " +
|
||||||
|
score0 + " >? " + score1,
|
||||||
|
score0 > score1);
|
||||||
|
assertTrue("d3 does not have a better score then d2: " +
|
||||||
|
score1 + " >? " + score2,
|
||||||
|
score1 > score2);
|
||||||
|
assertTrue("d3 does not have a better score then d1: " +
|
||||||
|
score2 + " >? " + score3,
|
||||||
|
score2 > score3);
|
||||||
|
|
||||||
|
} catch (Error e) {
|
||||||
|
printHits("testBooleanOptionalWithTiebreakerAndBoost",h);
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/** macro */
|
||||||
|
protected Query tq(String f, String t) {
|
||||||
|
return new TermQuery(new Term(f, t));
|
||||||
|
}
|
||||||
|
/** macro */
|
||||||
|
protected Query tq(String f, String t, float b) {
|
||||||
|
Query q = tq(f,t);
|
||||||
|
q.setBoost(b);
|
||||||
|
return q;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
protected void printHits(String test, Hits h) throws Exception {
|
||||||
|
|
||||||
|
System.err.println("------- " + test + " -------");
|
||||||
|
|
||||||
|
DecimalFormat f = new DecimalFormat("0.000000000");
|
||||||
|
|
||||||
|
for (int i = 0; i < h.length(); i++) {
|
||||||
|
Document d = h.doc(i);
|
||||||
|
float score = h.score(i);
|
||||||
|
System.err.println("#" + i + ": " + f.format(score) + " - " +
|
||||||
|
d.get("id"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue