mirror of https://github.com/apache/lucene.git
LUCENE-504: Change FuzzyQuery to use java.utilPriorityQueue which grows dynamically to support BooleanQuery.maxClauseCount(Integer.MAX_VALUE) without exhausting all memory.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@833544 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
53b807726a
commit
15743fc179
|
@ -19,10 +19,10 @@ package org.apache.lucene.search;
|
|||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
import org.apache.lucene.util.ToStringUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.PriorityQueue;
|
||||
|
||||
/** Implements the fuzzy search query. The similarity measurement
|
||||
* is based on the Levenshtein (edit distance) algorithm.
|
||||
|
@ -132,40 +132,40 @@ public class FuzzyQuery extends MultiTermQuery {
|
|||
return new TermQuery(term);
|
||||
}
|
||||
|
||||
int maxSize = BooleanQuery.getMaxClauseCount();
|
||||
PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>(1024);
|
||||
FilteredTermEnum enumerator = getEnum(reader);
|
||||
int maxClauseCount = BooleanQuery.getMaxClauseCount();
|
||||
ScoreTermQueue stQueue = new ScoreTermQueue(maxClauseCount);
|
||||
ScoreTerm reusableST = null;
|
||||
|
||||
try {
|
||||
ScoreTerm bottomSt = null;
|
||||
do {
|
||||
float score = 0.0f;
|
||||
Term t = enumerator.term();
|
||||
if (t != null) {
|
||||
score = enumerator.difference();
|
||||
if (reusableST == null) {
|
||||
reusableST = new ScoreTerm(t, score);
|
||||
} else if (score >= reusableST.score) {
|
||||
// reusableST holds the last "rejected" entry, so, if
|
||||
// this new score is not better than that, there's no
|
||||
// need to try inserting it
|
||||
reusableST.score = score;
|
||||
reusableST.term = t;
|
||||
} else {
|
||||
continue;
|
||||
final Term t = enumerator.term();
|
||||
if (t == null) break;
|
||||
ScoreTerm st = new ScoreTerm(t, enumerator.difference());
|
||||
if (stQueue.size() < maxSize) {
|
||||
// record the current bottom item
|
||||
if (bottomSt == null || st.compareTo(bottomSt) > 0) {
|
||||
bottomSt = st;
|
||||
}
|
||||
// add to PQ, as it is not yet filled up
|
||||
stQueue.offer(st);
|
||||
} else {
|
||||
assert bottomSt != null;
|
||||
// only add to PQ, if the ScoreTerm is greater than the current bottom,
|
||||
// as all entries will be enqueued after the current bottom and will never be visible
|
||||
if (st.compareTo(bottomSt) < 0) {
|
||||
stQueue.offer(st);
|
||||
}
|
||||
|
||||
reusableST = stQueue.insertWithOverflow(reusableST);
|
||||
}
|
||||
//System.out.println("current: "+st.term+"("+st.score+"), bottom: "+bottomSt.term+"("+bottomSt.score+")");
|
||||
} while (enumerator.next());
|
||||
} finally {
|
||||
enumerator.close();
|
||||
}
|
||||
|
||||
BooleanQuery query = new BooleanQuery(true);
|
||||
int size = stQueue.size();
|
||||
int size = Math.min(stQueue.size(), maxSize);
|
||||
for(int i = 0; i < size; i++){
|
||||
ScoreTerm st = stQueue.pop();
|
||||
ScoreTerm st = stQueue.poll();
|
||||
TermQuery tq = new TermQuery(st.term); // found a match
|
||||
tq.setBoost(getBoost() * st.score); // set the boost
|
||||
query.add(tq, BooleanClause.Occur.SHOULD); // add to query
|
||||
|
@ -173,10 +173,28 @@ public class FuzzyQuery extends MultiTermQuery {
|
|||
|
||||
return query;
|
||||
}
|
||||
|
||||
protected static class ScoreTerm implements Comparable<ScoreTerm> {
|
||||
public Term term;
|
||||
public float score;
|
||||
|
||||
public ScoreTerm(Term term, float score){
|
||||
this.term = term;
|
||||
this.score = score;
|
||||
}
|
||||
|
||||
public int compareTo(ScoreTerm other) {
|
||||
if (this.score == other.score)
|
||||
return this.term.compareTo(other.term);
|
||||
else
|
||||
// inverse ordering!!!
|
||||
return Float.compare(other.score, this.score);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString(String field) {
|
||||
StringBuilder buffer = new StringBuilder();
|
||||
final StringBuilder buffer = new StringBuilder();
|
||||
if (!term.field().equals(field)) {
|
||||
buffer.append(term.field());
|
||||
buffer.append(":");
|
||||
|
@ -188,32 +206,6 @@ public class FuzzyQuery extends MultiTermQuery {
|
|||
return buffer.toString();
|
||||
}
|
||||
|
||||
protected static class ScoreTerm {
|
||||
public Term term;
|
||||
public float score;
|
||||
|
||||
public ScoreTerm(Term term, float score){
|
||||
this.term = term;
|
||||
this.score = score;
|
||||
}
|
||||
}
|
||||
|
||||
protected static class ScoreTermQueue extends PriorityQueue<ScoreTerm> {
|
||||
|
||||
public ScoreTermQueue(int size){
|
||||
initialize(size);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean lessThan(ScoreTerm termA, ScoreTerm termB) {
|
||||
if (termA.score == termB.score)
|
||||
return termA.term.compareTo(termB.term) > 0;
|
||||
else
|
||||
return termA.score < termB.score;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
final int prime = 31;
|
||||
|
|
|
@ -17,6 +17,9 @@ package org.apache.lucene.search;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.Set;
|
||||
import java.util.HashSet;
|
||||
import java.util.Arrays;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
|
@ -76,6 +79,23 @@ public class TestFuzzyQuery extends LuceneTestCase {
|
|||
query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 6);
|
||||
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||
assertEquals(1, hits.length);
|
||||
|
||||
// test BooleanQuery.maxClauseCount
|
||||
int savedClauseCount = BooleanQuery.getMaxClauseCount();
|
||||
try {
|
||||
BooleanQuery.setMaxClauseCount(2);
|
||||
// This query would normally return 3 documents, because 3 terms match:
|
||||
query = new FuzzyQuery(new Term("field", "aaaab"), FuzzyQuery.defaultMinSimilarity, 3);
|
||||
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||
assertEquals("only 2 documents should match", 2, hits.length);
|
||||
Set<String> possibleTerms = new HashSet<String>(Arrays.asList("aaaaa","aaaab"));
|
||||
for (int i = 0; i < hits.length; i++) {
|
||||
final String term = searcher.doc(hits[i].doc).get("field");
|
||||
assertTrue("term '" + term + "' should not appear in results", possibleTerms.contains(term));
|
||||
}
|
||||
} finally {
|
||||
BooleanQuery.setMaxClauseCount(savedClauseCount);
|
||||
}
|
||||
|
||||
// not similar enough:
|
||||
query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMinSimilarity, 0);
|
||||
|
|
Loading…
Reference in New Issue