LUCENE-504: Change FuzzyQuery to use java.util.PriorityQueue, which grows dynamically, to support BooleanQuery.maxClauseCount(Integer.MAX_VALUE) without exhausting all memory.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@833544 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Uwe Schindler 2009-11-06 20:15:23 +00:00
parent 53b807726a
commit 15743fc179
2 changed files with 62 additions and 50 deletions

View File

@ -19,10 +19,10 @@ package org.apache.lucene.search;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.ToStringUtils;
import java.io.IOException;
import java.util.PriorityQueue;
/** Implements the fuzzy search query. The similarity measurement
* is based on the Levenshtein (edit distance) algorithm.
@ -132,40 +132,40 @@ public class FuzzyQuery extends MultiTermQuery {
return new TermQuery(term);
}
int maxSize = BooleanQuery.getMaxClauseCount();
PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>(1024);
FilteredTermEnum enumerator = getEnum(reader);
int maxClauseCount = BooleanQuery.getMaxClauseCount();
ScoreTermQueue stQueue = new ScoreTermQueue(maxClauseCount);
ScoreTerm reusableST = null;
try {
ScoreTerm bottomSt = null;
do {
float score = 0.0f;
Term t = enumerator.term();
if (t != null) {
score = enumerator.difference();
if (reusableST == null) {
reusableST = new ScoreTerm(t, score);
} else if (score >= reusableST.score) {
// reusableST holds the last "rejected" entry, so, if
// this new score is not better than that, there's no
// need to try inserting it
reusableST.score = score;
reusableST.term = t;
} else {
continue;
final Term t = enumerator.term();
if (t == null) break;
ScoreTerm st = new ScoreTerm(t, enumerator.difference());
if (stQueue.size() < maxSize) {
// record the current bottom item
if (bottomSt == null || st.compareTo(bottomSt) > 0) {
bottomSt = st;
}
// add to PQ, as it is not yet filled up
stQueue.offer(st);
} else {
assert bottomSt != null;
// only add to PQ, if the ScoreTerm is greater than the current bottom,
// as all entries will be enqueued after the current bottom and will never be visible
if (st.compareTo(bottomSt) < 0) {
stQueue.offer(st);
}
reusableST = stQueue.insertWithOverflow(reusableST);
}
//System.out.println("current: "+st.term+"("+st.score+"), bottom: "+bottomSt.term+"("+bottomSt.score+")");
} while (enumerator.next());
} finally {
enumerator.close();
}
BooleanQuery query = new BooleanQuery(true);
int size = stQueue.size();
int size = Math.min(stQueue.size(), maxSize);
for(int i = 0; i < size; i++){
ScoreTerm st = stQueue.pop();
ScoreTerm st = stQueue.poll();
TermQuery tq = new TermQuery(st.term); // found a match
tq.setBoost(getBoost() * st.score); // set the boost
query.add(tq, BooleanClause.Occur.SHOULD); // add to query
@ -173,10 +173,28 @@ public class FuzzyQuery extends MultiTermQuery {
return query;
}
/**
 * Holds a {@link Term} together with the score assigned to it.
 *
 * <p>The natural ordering is inverted with respect to score: an instance
 * with a higher score compares as <em>less than</em> one with a lower
 * score. When two scores are equal ({@code ==}), the ordering of the
 * terms themselves decides.
 */
protected static class ScoreTerm implements Comparable<ScoreTerm> {
  /** The term this entry refers to. */
  public Term term;

  /** The score associated with {@link #term}. */
  public float score;

  /**
   * Creates an entry for the given term and score.
   *
   * @param term the term to hold
   * @param score the score to associate with the term
   */
  public ScoreTerm(Term term, float score) {
    this.term = term;
    this.score = score;
  }

  /**
   * Compares by score in descending order, falling back to the term
   * ordering when both scores are equal.
   *
   * @param other the entry to compare against
   * @return a negative, zero or positive value as defined by
   *         {@link Comparable#compareTo(Object)}
   */
  public int compareTo(ScoreTerm other) {
    if (this.score != other.score) {
      // inverse ordering: arguments are swapped on purpose so that the
      // higher score sorts first
      return Float.compare(other.score, this.score);
    }
    // equal scores: fall back to the term's own ordering
    return this.term.compareTo(other.term);
  }
}
@Override
public String toString(String field) {
StringBuilder buffer = new StringBuilder();
final StringBuilder buffer = new StringBuilder();
if (!term.field().equals(field)) {
buffer.append(term.field());
buffer.append(":");
@ -188,32 +206,6 @@ public class FuzzyQuery extends MultiTermQuery {
return buffer.toString();
}
/**
 * Simple mutable holder pairing a {@link Term} with a score.
 */
protected static class ScoreTerm {
  /** The term this entry refers to. */
  public Term term;

  /** The score associated with {@link #term}. */
  public float score;

  /**
   * Creates an entry for the given term and score.
   *
   * @param term the term to hold
   * @param score the score to associate with the term
   */
  public ScoreTerm(Term term, float score) {
    this.score = score;
    this.term = term;
  }
}
/**
 * Priority queue of {@link ScoreTerm} entries ordered primarily by score.
 */
protected static class ScoreTermQueue extends PriorityQueue<ScoreTerm> {

  /**
   * Creates a queue and passes {@code size} on to {@code initialize(int)}.
   *
   * @param size the size handed to {@code initialize}
   */
  public ScoreTermQueue(int size) {
    initialize(size);
  }

  /**
   * Decides whether {@code termA} ranks below {@code termB}.
   *
   * <p>A lower score ranks below a higher one. When both scores are equal
   * ({@code ==}), the entry whose term compares greater ranks below.
   *
   * @param termA the first entry
   * @param termB the second entry
   * @return {@code true} if {@code termA} is considered less than {@code termB}
   */
  @Override
  protected boolean lessThan(ScoreTerm termA, ScoreTerm termB) {
    if (termA.score != termB.score) {
      return termA.score < termB.score;
    }
    // equal scores: the term that compares greater is treated as the lesser entry
    return termA.term.compareTo(termB.term) > 0;
  }
}
@Override
public int hashCode() {
final int prime = 31;

View File

@ -17,6 +17,9 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import java.util.Set;
import java.util.HashSet;
import java.util.Arrays;
import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
@ -76,6 +79,23 @@ public class TestFuzzyQuery extends LuceneTestCase {
query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 6);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(1, hits.length);
// test BooleanQuery.maxClauseCount
int savedClauseCount = BooleanQuery.getMaxClauseCount();
try {
BooleanQuery.setMaxClauseCount(2);
// This query would normally return 3 documents, because 3 terms match:
query = new FuzzyQuery(new Term("field", "aaaab"), FuzzyQuery.defaultMinSimilarity, 3);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals("only 2 documents should match", 2, hits.length);
Set<String> possibleTerms = new HashSet<String>(Arrays.asList("aaaaa","aaaab"));
for (int i = 0; i < hits.length; i++) {
final String term = searcher.doc(hits[i].doc).get("field");
assertTrue("term '" + term + "' should not appear in results", possibleTerms.contains(term));
}
} finally {
BooleanQuery.setMaxClauseCount(savedClauseCount);
}
// not similar enough:
query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMinSimilarity, 0);