LUCENE-504: Change FuzzyQuery to use java.util.PriorityQueue, which grows dynamically, to support BooleanQuery.maxClauseCount(Integer.MAX_VALUE) without exhausting all memory.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@833544 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Uwe Schindler 2009-11-06 20:15:23 +00:00
parent 53b807726a
commit 15743fc179
2 changed files with 62 additions and 50 deletions

View File

@ -19,10 +19,10 @@ package org.apache.lucene.search;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.ToStringUtils; import org.apache.lucene.util.ToStringUtils;
import java.io.IOException; import java.io.IOException;
import java.util.PriorityQueue;
/** Implements the fuzzy search query. The similarity measurement /** Implements the fuzzy search query. The similarity measurement
* is based on the Levenshtein (edit distance) algorithm. * is based on the Levenshtein (edit distance) algorithm.
@ -132,40 +132,40 @@ public class FuzzyQuery extends MultiTermQuery {
return new TermQuery(term); return new TermQuery(term);
} }
int maxSize = BooleanQuery.getMaxClauseCount();
PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>(1024);
FilteredTermEnum enumerator = getEnum(reader); FilteredTermEnum enumerator = getEnum(reader);
int maxClauseCount = BooleanQuery.getMaxClauseCount();
ScoreTermQueue stQueue = new ScoreTermQueue(maxClauseCount);
ScoreTerm reusableST = null;
try { try {
ScoreTerm bottomSt = null;
do { do {
float score = 0.0f; final Term t = enumerator.term();
Term t = enumerator.term(); if (t == null) break;
if (t != null) { ScoreTerm st = new ScoreTerm(t, enumerator.difference());
score = enumerator.difference(); if (stQueue.size() < maxSize) {
if (reusableST == null) { // record the current bottom item
reusableST = new ScoreTerm(t, score); if (bottomSt == null || st.compareTo(bottomSt) > 0) {
} else if (score >= reusableST.score) { bottomSt = st;
// reusableST holds the last "rejected" entry, so, if }
// this new score is not better than that, there's no // add to PQ, as it is not yet filled up
// need to try inserting it stQueue.offer(st);
reusableST.score = score;
reusableST.term = t;
} else { } else {
continue; assert bottomSt != null;
// only add to the PQ if the ScoreTerm ranks ahead of the current bottom (compareTo < 0),
// as all other entries would be enqueued after the current bottom and would never be visible
if (st.compareTo(bottomSt) < 0) {
stQueue.offer(st);
} }
reusableST = stQueue.insertWithOverflow(reusableST);
} }
//System.out.println("current: "+st.term+"("+st.score+"), bottom: "+bottomSt.term+"("+bottomSt.score+")");
} while (enumerator.next()); } while (enumerator.next());
} finally { } finally {
enumerator.close(); enumerator.close();
} }
BooleanQuery query = new BooleanQuery(true); BooleanQuery query = new BooleanQuery(true);
int size = stQueue.size(); int size = Math.min(stQueue.size(), maxSize);
for(int i = 0; i < size; i++){ for(int i = 0; i < size; i++){
ScoreTerm st = stQueue.pop(); ScoreTerm st = stQueue.poll();
TermQuery tq = new TermQuery(st.term); // found a match TermQuery tq = new TermQuery(st.term); // found a match
tq.setBoost(getBoost() * st.score); // set the boost tq.setBoost(getBoost() * st.score); // set the boost
query.add(tq, BooleanClause.Occur.SHOULD); // add to query query.add(tq, BooleanClause.Occur.SHOULD); // add to query
@ -174,9 +174,27 @@ public class FuzzyQuery extends MultiTermQuery {
return query; return query;
} }
/**
 * Pairs a {@link Term} with its score so that instances can be ordered.
 *
 * <p>The ordering puts higher scores first; entries with equal scores are
 * ordered by comparing their terms.
 */
protected static class ScoreTerm implements Comparable<ScoreTerm> {
  /** The term this entry refers to. */
  public Term term;
  /** The score recorded for {@link #term}. */
  public float score;

  /**
   * Creates an entry holding the given term and score.
   *
   * @param term the term to store
   * @param score the score to store for the term
   */
  public ScoreTerm(Term term, float score){
    this.score = score;
    this.term = term;
  }

  /**
   * Compares by score in descending order, falling back to the term
   * comparison when both scores are equal.
   *
   * @param other the entry to compare against
   * @return a negative value, zero, or a positive value as this entry sorts
   *         before, equal to, or after {@code other}
   */
  public int compareTo(ScoreTerm other) {
    if (this.score != other.score) {
      // arguments deliberately swapped: a higher score must sort first
      return Float.compare(other.score, this.score);
    }
    // identical scores: let the terms decide
    return this.term.compareTo(other.term);
  }
}
@Override @Override
public String toString(String field) { public String toString(String field) {
StringBuilder buffer = new StringBuilder(); final StringBuilder buffer = new StringBuilder();
if (!term.field().equals(field)) { if (!term.field().equals(field)) {
buffer.append(term.field()); buffer.append(term.field());
buffer.append(":"); buffer.append(":");
@ -188,32 +206,6 @@ public class FuzzyQuery extends MultiTermQuery {
return buffer.toString(); return buffer.toString();
} }
/** Simple holder that associates a {@link Term} with a score. */
protected static class ScoreTerm {
  /** The term this entry refers to. */
  public Term term;
  /** The score recorded for {@link #term}. */
  public float score;

  /**
   * Creates a holder for the given term and score.
   *
   * @param term the term to store
   * @param score the score to store for the term
   */
  public ScoreTerm(Term term, float score){
    this.score = score;
    this.term = term;
  }
}
/**
 * A {@link PriorityQueue} of {@link ScoreTerm} entries ordered by score,
 * with the term comparison used to break ties between equal scores.
 */
protected static class ScoreTermQueue extends PriorityQueue<ScoreTerm> {

  /**
   * Creates the queue and initializes it with the given size.
   *
   * @param size the value passed to {@code initialize}
   */
  public ScoreTermQueue(int size){
    initialize(size);
  }

  /**
   * Decides whether {@code termA} orders before {@code termB}.
   *
   * @param termA the first entry
   * @param termB the second entry
   * @return {@code true} if {@code termA} has the lower score or, when the
   *         scores are equal, if its term compares greater than that of
   *         {@code termB}
   */
  @Override
  protected boolean lessThan(ScoreTerm termA, ScoreTerm termB) {
    if (termA.score != termB.score) {
      return termA.score < termB.score;
    }
    // tie on score: the entry whose term compares greater is the lesser one
    return termA.term.compareTo(termB.term) > 0;
  }
}
@Override @Override
public int hashCode() { public int hashCode() {
final int prime = 31; final int prime = 31;

View File

@ -17,6 +17,9 @@ package org.apache.lucene.search;
* limitations under the License. * limitations under the License.
*/ */
import java.util.Set;
import java.util.HashSet;
import java.util.Arrays;
import java.io.IOException; import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer;
@ -77,6 +80,23 @@ public class TestFuzzyQuery extends LuceneTestCase {
hits = searcher.search(query, null, 1000).scoreDocs; hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(1, hits.length); assertEquals(1, hits.length);
// test BooleanQuery.maxClauseCount
int savedClauseCount = BooleanQuery.getMaxClauseCount();
try {
BooleanQuery.setMaxClauseCount(2);
// This query would normally return 3 documents, because 3 terms match:
query = new FuzzyQuery(new Term("field", "aaaab"), FuzzyQuery.defaultMinSimilarity, 3);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals("only 2 documents should match", 2, hits.length);
Set<String> possibleTerms = new HashSet<String>(Arrays.asList("aaaaa","aaaab"));
for (int i = 0; i < hits.length; i++) {
final String term = searcher.doc(hits[i].doc).get("field");
assertTrue("term '" + term + "' should not appear in results", possibleTerms.contains(term));
}
} finally {
BooleanQuery.setMaxClauseCount(savedClauseCount);
}
// not similar enough: // not similar enough:
query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMinSimilarity, 0); query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMinSimilarity, 0);
hits = searcher.search(query, null, 1000).scoreDocs; hits = searcher.search(query, null, 1000).scoreDocs;