mirror of https://github.com/apache/lucene.git
LUCENE-504: Change FuzzyQuery to use java.util.PriorityQueue, which grows dynamically, to support BooleanQuery.maxClauseCount(Integer.MAX_VALUE) without exhausting all memory.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@833544 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
53b807726a
commit
15743fc179
|
@ -19,10 +19,10 @@ package org.apache.lucene.search;
|
||||||
|
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
import org.apache.lucene.util.PriorityQueue;
|
|
||||||
import org.apache.lucene.util.ToStringUtils;
|
import org.apache.lucene.util.ToStringUtils;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.PriorityQueue;
|
||||||
|
|
||||||
/** Implements the fuzzy search query. The similarity measurement
|
/** Implements the fuzzy search query. The similarity measurement
|
||||||
* is based on the Levenshtein (edit distance) algorithm.
|
* is based on the Levenshtein (edit distance) algorithm.
|
||||||
|
@ -132,40 +132,40 @@ public class FuzzyQuery extends MultiTermQuery {
|
||||||
return new TermQuery(term);
|
return new TermQuery(term);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int maxSize = BooleanQuery.getMaxClauseCount();
|
||||||
|
PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>(1024);
|
||||||
FilteredTermEnum enumerator = getEnum(reader);
|
FilteredTermEnum enumerator = getEnum(reader);
|
||||||
int maxClauseCount = BooleanQuery.getMaxClauseCount();
|
|
||||||
ScoreTermQueue stQueue = new ScoreTermQueue(maxClauseCount);
|
|
||||||
ScoreTerm reusableST = null;
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
ScoreTerm bottomSt = null;
|
||||||
do {
|
do {
|
||||||
float score = 0.0f;
|
final Term t = enumerator.term();
|
||||||
Term t = enumerator.term();
|
if (t == null) break;
|
||||||
if (t != null) {
|
ScoreTerm st = new ScoreTerm(t, enumerator.difference());
|
||||||
score = enumerator.difference();
|
if (stQueue.size() < maxSize) {
|
||||||
if (reusableST == null) {
|
// record the current bottom item
|
||||||
reusableST = new ScoreTerm(t, score);
|
if (bottomSt == null || st.compareTo(bottomSt) > 0) {
|
||||||
} else if (score >= reusableST.score) {
|
bottomSt = st;
|
||||||
// reusableST holds the last "rejected" entry, so, if
|
}
|
||||||
// this new score is not better than that, there's no
|
// add to PQ, as it is not yet filled up
|
||||||
// need to try inserting it
|
stQueue.offer(st);
|
||||||
reusableST.score = score;
|
|
||||||
reusableST.term = t;
|
|
||||||
} else {
|
} else {
|
||||||
continue;
|
assert bottomSt != null;
|
||||||
|
// only add to PQ, if the ScoreTerm is greater than the current bottom,
|
||||||
|
// as all entries will be enqueued after the current bottom and will never be visible
|
||||||
|
if (st.compareTo(bottomSt) < 0) {
|
||||||
|
stQueue.offer(st);
|
||||||
}
|
}
|
||||||
|
|
||||||
reusableST = stQueue.insertWithOverflow(reusableST);
|
|
||||||
}
|
}
|
||||||
|
//System.out.println("current: "+st.term+"("+st.score+"), bottom: "+bottomSt.term+"("+bottomSt.score+")");
|
||||||
} while (enumerator.next());
|
} while (enumerator.next());
|
||||||
} finally {
|
} finally {
|
||||||
enumerator.close();
|
enumerator.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
BooleanQuery query = new BooleanQuery(true);
|
BooleanQuery query = new BooleanQuery(true);
|
||||||
int size = stQueue.size();
|
int size = Math.min(stQueue.size(), maxSize);
|
||||||
for(int i = 0; i < size; i++){
|
for(int i = 0; i < size; i++){
|
||||||
ScoreTerm st = stQueue.pop();
|
ScoreTerm st = stQueue.poll();
|
||||||
TermQuery tq = new TermQuery(st.term); // found a match
|
TermQuery tq = new TermQuery(st.term); // found a match
|
||||||
tq.setBoost(getBoost() * st.score); // set the boost
|
tq.setBoost(getBoost() * st.score); // set the boost
|
||||||
query.add(tq, BooleanClause.Occur.SHOULD); // add to query
|
query.add(tq, BooleanClause.Occur.SHOULD); // add to query
|
||||||
|
@ -174,9 +174,27 @@ public class FuzzyQuery extends MultiTermQuery {
|
||||||
return query;
|
return query;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected static class ScoreTerm implements Comparable<ScoreTerm> { // pairs a candidate term with its score; ordered best-score-first (see compareTo)
|
||||||
|
public Term term; // the enumerated term that matched
|
||||||
|
public float score; // value taken from the enumerator's difference(); multiplied into the clause boost by the caller
|
||||||
|
|
||||||
|
public ScoreTerm(Term term, float score){ // stores both arguments unchanged
|
||||||
|
this.term = term;
|
||||||
|
this.score = score;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int compareTo(ScoreTerm other) { // negative result means this entry sorts before other
|
||||||
|
if (this.score == other.score) // equal scores: fall back to the terms' own ordering as tie-breaker
|
||||||
|
return this.term.compareTo(other.term);
|
||||||
|
else
|
||||||
|
// Inverse ordering: the arguments to Float.compare are swapped, so a HIGHER score compares as smaller and sorts first.
|
||||||
|
return Float.compare(other.score, this.score);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString(String field) {
|
public String toString(String field) {
|
||||||
StringBuilder buffer = new StringBuilder();
|
final StringBuilder buffer = new StringBuilder();
|
||||||
if (!term.field().equals(field)) {
|
if (!term.field().equals(field)) {
|
||||||
buffer.append(term.field());
|
buffer.append(term.field());
|
||||||
buffer.append(":");
|
buffer.append(":");
|
||||||
|
@ -188,32 +206,6 @@ public class FuzzyQuery extends MultiTermQuery {
|
||||||
return buffer.toString();
|
return buffer.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected static class ScoreTerm { // removed-side version: plain holder for a term and its score, with no ordering of its own
|
|
||||||
public Term term; // the enumerated term that matched
|
|
||||||
public float score; // value taken from the enumerator's difference(); multiplied into the clause boost by the caller
|
|
||||||
|
|
||||||
public ScoreTerm(Term term, float score){ // stores both arguments unchanged
|
|
||||||
this.term = term;
|
|
||||||
this.score = score;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
protected static class ScoreTermQueue extends PriorityQueue<ScoreTerm> { // removed-side version: queue of ScoreTerm ordered by lessThan below
|
|
||||||
|
|
||||||
public ScoreTermQueue(int size){ // the caller passes BooleanQuery.getMaxClauseCount() as size
|
|
||||||
initialize(size); // NOTE(review): presumably the inherited Lucene PriorityQueue initializer that fixes the capacity - confirm
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected boolean lessThan(ScoreTerm termA, ScoreTerm termB) { // true when termA ranks below termB
|
|
||||||
if (termA.score == termB.score) // equal scores: the entry whose term compares greater is treated as the lesser one
|
|
||||||
return termA.term.compareTo(termB.term) > 0;
|
|
||||||
else
|
|
||||||
return termA.score < termB.score; // otherwise the lower score is the lesser entry
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int hashCode() {
|
public int hashCode() {
|
||||||
final int prime = 31;
|
final int prime = 31;
|
||||||
|
|
|
@ -17,6 +17,9 @@ package org.apache.lucene.search;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Arrays;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||||
|
@ -77,6 +80,23 @@ public class TestFuzzyQuery extends LuceneTestCase {
|
||||||
hits = searcher.search(query, null, 1000).scoreDocs;
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
assertEquals(1, hits.length);
|
assertEquals(1, hits.length);
|
||||||
|
|
||||||
|
// test BooleanQuery.maxClauseCount
|
||||||
|
int savedClauseCount = BooleanQuery.getMaxClauseCount();
|
||||||
|
try {
|
||||||
|
BooleanQuery.setMaxClauseCount(2);
|
||||||
|
// This query would normally return 3 documents, because 3 terms match:
|
||||||
|
query = new FuzzyQuery(new Term("field", "aaaab"), FuzzyQuery.defaultMinSimilarity, 3);
|
||||||
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
assertEquals("only 2 documents should match", 2, hits.length);
|
||||||
|
Set<String> possibleTerms = new HashSet<String>(Arrays.asList("aaaaa","aaaab"));
|
||||||
|
for (int i = 0; i < hits.length; i++) {
|
||||||
|
final String term = searcher.doc(hits[i].doc).get("field");
|
||||||
|
assertTrue("term '" + term + "' should not appear in results", possibleTerms.contains(term));
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
BooleanQuery.setMaxClauseCount(savedClauseCount);
|
||||||
|
}
|
||||||
|
|
||||||
// not similar enough:
|
// not similar enough:
|
||||||
query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMinSimilarity, 0);
|
query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMinSimilarity, 0);
|
||||||
hits = searcher.search(query, null, 1000).scoreDocs;
|
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||||
|
|
Loading…
Reference in New Issue