FuzzyQuery is now using a default prefix length of 2 in

order to become usable for big indices. Furthermore, it
no longer throws TooManyClausesException.


git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150572 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Christoph Goller 2004-10-06 16:19:52 +00:00
parent 435e471111
commit a433e2c1de
7 changed files with 268 additions and 47 deletions

View File

@ -74,6 +74,7 @@ public class QueryParser implements QueryParserConstants {
String field;
int phraseSlop = 0;
float fuzzyMinSim = FuzzyQuery.defaultMinSimilarity;
int fuzzyPrefixLength = FuzzyQuery.defaultPrefixLength;
Locale locale = Locale.getDefault();
/** Parses a query string, returning a {@link org.apache.lucene.search.Query}.
@ -131,18 +132,36 @@ public class QueryParser implements QueryParserConstants {
}
/**
* Get the default minimal similarity for fuzzy queries.
* Get the minimal similarity for fuzzy queries.
*/
public float getFuzzyMinSim() {
return fuzzyMinSim;
}
/**
*Set the default minimum similarity for fuzzy queries.
* Set the minimum similarity for fuzzy queries.
* Default is 0.5f.
*/
public void setFuzzyMinSim(float fuzzyMinSim) {
this.fuzzyMinSim = fuzzyMinSim;
}
/**
* Get the prefix length for fuzzy queries.
* @return Returns the fuzzyPrefixLength.
*/
public int getFuzzyPrefixLength() {
return fuzzyPrefixLength;
}
/**
* Set the prefix length for fuzzy queries. Default is 2.
* @param fuzzyPrefixLength The fuzzyPrefixLength to set.
*/
public void setFuzzyPrefixLength(int fuzzyPrefixLength) {
this.fuzzyPrefixLength = fuzzyPrefixLength;
}
/**
* Sets the default slop for phrases. If zero, then exact phrase matches
* are required. Default value is zero.
@ -419,7 +438,7 @@ public class QueryParser implements QueryParserConstants {
return new PrefixQuery(t);
}
/**
/**
* Factory method for generating a query (similar to
* ({@link #getWildcardQuery}). Called when parser parses
* an input term token that has the fuzzy suffix (~) appended.
@ -433,7 +452,7 @@ public class QueryParser implements QueryParserConstants {
protected Query getFuzzyQuery(String field, String termStr, float minSimilarity) throws ParseException
{
Term t = new Term(field, termStr);
return new FuzzyQuery(t, minSimilarity);
return new FuzzyQuery(t, minSimilarity, fuzzyPrefixLength);
}
/**

View File

@ -97,6 +97,7 @@ public class QueryParser {
String field;
int phraseSlop = 0;
float fuzzyMinSim = FuzzyQuery.defaultMinSimilarity;
int fuzzyPrefixLength = FuzzyQuery.defaultPrefixLength;
Locale locale = Locale.getDefault();
/** Parses a query string, returning a {@link org.apache.lucene.search.Query}.
@ -154,17 +155,35 @@ public class QueryParser {
}
/**
* Get the default minimal similarity for fuzzy queries.
* Get the minimal similarity for fuzzy queries.
*/
public float getFuzzyMinSim() {
return fuzzyMinSim;
}
/**
*Set the default minimum similarity for fuzzy queries.
* Set the minimum similarity for fuzzy queries.
* Default is 0.5f.
*/
public void setFuzzyMinSim(float fuzzyMinSim) {
this.fuzzyMinSim = fuzzyMinSim;
}
/**
* Get the prefix length for fuzzy queries.
* @return Returns the fuzzyPrefixLength.
*/
public int getFuzzyPrefixLength() {
return fuzzyPrefixLength;
}
/**
* Set the prefix length for fuzzy queries. Default is 2.
* @param fuzzyPrefixLength The fuzzyPrefixLength to set.
*/
public void setFuzzyPrefixLength(int fuzzyPrefixLength) {
this.fuzzyPrefixLength = fuzzyPrefixLength;
}
/**
* Sets the default slop for phrases. If zero, then exact phrase matches
@ -441,8 +460,8 @@ public class QueryParser {
Term t = new Term(field, termStr);
return new PrefixQuery(t);
}
/**
/**
* Factory method for generating a query (similar to
* ({@link #getWildcardQuery}). Called when parser parses
* an input term token that has the fuzzy suffix (~) appended.
@ -456,7 +475,7 @@ public class QueryParser {
protected Query getFuzzyQuery(String field, String termStr, float minSimilarity) throws ParseException
{
Term t = new Term(field, termStr);
return new FuzzyQuery(t, minSimilarity);
return new FuzzyQuery(t, minSimilarity, fuzzyPrefixLength);
}
/**

View File

@ -34,7 +34,7 @@ public abstract class FilteredTermEnum extends TermEnum {
protected abstract boolean termCompare(Term term);
/** Equality measure on the term */
protected abstract float difference();
public abstract float difference();
/** Indiciates the end of the enumeration has been reached */
protected abstract boolean endEnum();

View File

@ -18,6 +18,8 @@ package org.apache.lucene.search;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.PriorityQueue;
import java.io.IOException;
/** Implements the fuzzy search query. The similiarity measurement
@ -26,6 +28,8 @@ import java.io.IOException;
public final class FuzzyQuery extends MultiTermQuery {
public final static float defaultMinSimilarity = 0.5f;
public final static int defaultPrefixLength = 2;
private float minimumSimilarity;
private int prefixLength;
@ -48,16 +52,14 @@ public final class FuzzyQuery extends MultiTermQuery {
public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength) throws IllegalArgumentException {
super(term);
if (minimumSimilarity > 1.0f)
throw new IllegalArgumentException("minimumSimilarity > 1");
if (minimumSimilarity >= 1.0f)
throw new IllegalArgumentException("minimumSimilarity >= 1");
else if (minimumSimilarity < 0.0f)
throw new IllegalArgumentException("minimumSimilarity < 0");
this.minimumSimilarity = minimumSimilarity;
this.minimumSimilarity = minimumSimilarity;
if(prefixLength < 0)
throw new IllegalArgumentException("prefixLength < 0");
else if(prefixLength >= term.text().length())
throw new IllegalArgumentException("prefixLength >= term.text().length()");
this.prefixLength = prefixLength;
}
@ -65,14 +67,14 @@ public final class FuzzyQuery extends MultiTermQuery {
* Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, minimumSimilarity, 0)}.
*/
public FuzzyQuery(Term term, float minimumSimilarity) throws IllegalArgumentException {
this(term, minimumSimilarity, 0);
this(term, minimumSimilarity, defaultPrefixLength);
}
/**
* Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, 0.5f, 0)}.
*/
public FuzzyQuery(Term term) {
this(term, defaultMinSimilarity, 0);
this(term, defaultMinSimilarity, defaultPrefixLength);
}
/**
@ -95,8 +97,74 @@ public final class FuzzyQuery extends MultiTermQuery {
protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
return new FuzzyTermEnum(reader, getTerm(), minimumSimilarity, prefixLength);
}
public Query rewrite(IndexReader reader) throws IOException {
FilteredTermEnum enumerator = getEnum(reader);
int maxClauseCount = BooleanQuery.getMaxClauseCount();
ScoreTermQueue stQueue = new ScoreTermQueue(maxClauseCount);
try {
do {
float minScore = 0.0f;
float score = 0.0f;
Term t = enumerator.term();
if (t != null) {
score = enumerator.difference();
// terms come in alphabetical order, therefore if queue is full and score
// not bigger than minScore, we can skip
if(stQueue.size() < maxClauseCount || score > minScore){
stQueue.insert(new ScoreTerm(t, score));
minScore = ((ScoreTerm)stQueue.top()).score; // maintain minScore
}
}
} while (enumerator.next());
} finally {
enumerator.close();
}
BooleanQuery query = new BooleanQuery();
int size = stQueue.size();
for(int i = 0; i < size; i++){
ScoreTerm st = (ScoreTerm) stQueue.pop();
TermQuery tq = new TermQuery(st.term); // found a match
tq.setBoost(getBoost() * st.score); // set the boost
query.add(tq, BooleanClause.Occur.SHOULD); // add to query
}
return query;
}
public String toString(String field) {
return super.toString(field) + '~' + Float.toString(minimumSimilarity);
}
private static class ScoreTerm{
public Term term;
public float score;
public ScoreTerm(Term term, float score){
this.term = term;
this.score = score;
}
}
private static class ScoreTermQueue extends PriorityQueue {
public ScoreTermQueue(int size){
initialize(size);
}
/* (non-Javadoc)
* @see org.apache.lucene.util.PriorityQueue#lessThan(java.lang.Object, java.lang.Object)
*/
protected boolean lessThan(Object a, Object b) {
ScoreTerm termA = (ScoreTerm)a;
ScoreTerm termB = (ScoreTerm)b;
if (termA.score == termB.score)
return termA.term.compareTo(termB.term) > 0;
else
return termA.score < termB.score;
}
}
}

View File

@ -25,7 +25,7 @@ import org.apache.lucene.index.Term;
<p>Term enumerations are always ordered by Term.compareTo(). Each term in
the enumeration is greater than all that precede it. */
public final class FuzzyTermEnum extends FilteredTermEnum {
double distance;
float similarity;
boolean endEnum = false;
Term searchTerm = null;
@ -35,7 +35,7 @@ public final class FuzzyTermEnum extends FilteredTermEnum {
String prefix = "";
int prefixLength = 0;
float minimumSimilarity;
double scale_factor;
float scale_factor;
/**
@ -47,7 +47,7 @@ public final class FuzzyTermEnum extends FilteredTermEnum {
* @see #FuzzyTermEnum(IndexReader, Term, float, int)
*/
public FuzzyTermEnum(IndexReader reader, Term term) throws IOException {
this(reader, term, FuzzyQuery.defaultMinSimilarity, 0);
this(reader, term, FuzzyQuery.defaultMinSimilarity, FuzzyQuery.defaultPrefixLength);
}
/**
@ -60,7 +60,7 @@ public final class FuzzyTermEnum extends FilteredTermEnum {
* @see #FuzzyTermEnum(IndexReader, Term, float, int)
*/
public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity) throws IOException {
this(reader, term, minSimilarity, 0);
this(reader, term, minSimilarity, FuzzyQuery.defaultPrefixLength);
}
/**
@ -76,18 +76,30 @@ public final class FuzzyTermEnum extends FilteredTermEnum {
*/
public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity, int prefixLength) throws IOException {
super();
if (minimumSimilarity >= 1.0f)
throw new IllegalArgumentException("minimumSimilarity >= 1");
else if (minimumSimilarity < 0.0f)
throw new IllegalArgumentException("minimumSimilarity < 0");
minimumSimilarity = minSimilarity;
scale_factor = 1.0f / (1.0f - minimumSimilarity);
searchTerm = term;
field = searchTerm.field();
text = searchTerm.text();
textlen = text.length();
if(prefixLength > 0 && prefixLength < textlen){
this.prefixLength = prefixLength;
prefix = text.substring(0, prefixLength);
text = text.substring(prefixLength);
textlen = text.length();
}
if(prefixLength < 0)
throw new IllegalArgumentException("prefixLength < 0");
if(prefixLength > textlen)
prefixLength = textlen;
this.prefixLength = prefixLength;
prefix = text.substring(0, prefixLength);
text = text.substring(prefixLength);
textlen = text.length();
setEnum(reader.terms(new Term(searchTerm.field(), prefix)));
}
@ -101,15 +113,15 @@ public final class FuzzyTermEnum extends FilteredTermEnum {
String target = termText.substring(prefixLength);
int targetlen = target.length();
int dist = editDistance(text, target, textlen, targetlen);
distance = 1 - ((double)dist / (double)Math.min(textlen, targetlen));
return (distance > minimumSimilarity);
similarity = 1 - ((float)dist / (float) (prefixLength + Math.min(textlen, targetlen)));
return (similarity > minimumSimilarity);
}
endEnum = true;
return false;
}
protected final float difference() {
return (float)((distance - minimumSimilarity) * scale_factor);
public final float difference() {
return (float)((similarity - minimumSimilarity) * scale_factor);
}
public final boolean endEnum() {

View File

@ -248,10 +248,10 @@ public class TestQueryParser extends TestCase {
assertTrue(getQuery("term~0.7", null) instanceof FuzzyQuery);
FuzzyQuery fq = (FuzzyQuery)getQuery("term~0.7", null);
assertEquals(0.7f, fq.getMinSimilarity(), 0.1f);
assertEquals(0, fq.getPrefixLength());
assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength());
fq = (FuzzyQuery)getQuery("term~", null);
assertEquals(0.5f, fq.getMinSimilarity(), 0.1f);
assertEquals(0, fq.getPrefixLength());
assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength());
try {
getQuery("term~1.1", null); // value > 1, throws exception
fail();

View File

@ -47,20 +47,40 @@ public class TestFuzzyQuery extends TestCase {
writer.close();
IndexSearcher searcher = new IndexSearcher(directory);
FuzzyQuery query = new FuzzyQuery(new Term("field", "aaaaa"));
FuzzyQuery query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 0);
Hits hits = searcher.search(query);
assertEquals(3, hits.length());
// same with prefix
query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 1);
hits = searcher.search(query);
assertEquals(3, hits.length());
query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 2);
hits = searcher.search(query);
assertEquals(3, hits.length());
query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 3);
hits = searcher.search(query);
assertEquals(3, hits.length());
query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 4);
hits = searcher.search(query);
assertEquals(2, hits.length());
query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 5);
hits = searcher.search(query);
assertEquals(1, hits.length());
query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 6);
hits = searcher.search(query);
assertEquals(1, hits.length());
// not similar enough:
query = new FuzzyQuery(new Term("field", "xxxxx"));
query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMinSimilarity, 0);
hits = searcher.search(query);
assertEquals(0, hits.length());
query = new FuzzyQuery(new Term("field", "aaccc")); // edit distance to "aaaaa" = 3
query = new FuzzyQuery(new Term("field", "aaccc"), FuzzyQuery.defaultMinSimilarity, 0); // edit distance to "aaaaa" = 3
hits = searcher.search(query);
assertEquals(0, hits.length());
// query identical to a word in the index:
query = new FuzzyQuery(new Term("field", "aaaaa"));
query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 0);
hits = searcher.search(query);
assertEquals(3, hits.length());
assertEquals(hits.doc(0).get("field"), ("aaaaa"));
@ -69,20 +89,71 @@ public class TestFuzzyQuery extends TestCase {
assertEquals(hits.doc(2).get("field"), ("aaabb"));
// query similar to a word in the index:
query = new FuzzyQuery(new Term("field", "aaaac"));
query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 0);
hits = searcher.search(query);
assertEquals(3, hits.length());
assertEquals(hits.doc(0).get("field"), ("aaaaa"));
assertEquals(hits.doc(1).get("field"), ("aaaab"));
assertEquals(hits.doc(2).get("field"), ("aaabb"));
// now with prefix
query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 1);
hits = searcher.search(query);
assertEquals(3, hits.length());
assertEquals(hits.doc(0).get("field"), ("aaaaa"));
assertEquals(hits.doc(1).get("field"), ("aaaab"));
assertEquals(hits.doc(2).get("field"), ("aaabb"));
query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 2);
hits = searcher.search(query);
assertEquals(3, hits.length());
assertEquals(hits.doc(0).get("field"), ("aaaaa"));
assertEquals(hits.doc(1).get("field"), ("aaaab"));
assertEquals(hits.doc(2).get("field"), ("aaabb"));
query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 3);
hits = searcher.search(query);
assertEquals(3, hits.length());
assertEquals(hits.doc(0).get("field"), ("aaaaa"));
assertEquals(hits.doc(1).get("field"), ("aaaab"));
assertEquals(hits.doc(2).get("field"), ("aaabb"));
query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 4);
hits = searcher.search(query);
assertEquals(2, hits.length());
assertEquals(hits.doc(0).get("field"), ("aaaaa"));
assertEquals(hits.doc(1).get("field"), ("aaaab"));
query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 5);
hits = searcher.search(query);
assertEquals(0, hits.length());
query = new FuzzyQuery(new Term("field", "ddddX"));
query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 0);
hits = searcher.search(query);
assertEquals(1, hits.length());
assertEquals(hits.doc(0).get("field"), ("ddddd"));
// now with prefix
query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 1);
hits = searcher.search(query);
assertEquals(1, hits.length());
assertEquals(hits.doc(0).get("field"), ("ddddd"));
query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 2);
hits = searcher.search(query);
assertEquals(1, hits.length());
assertEquals(hits.doc(0).get("field"), ("ddddd"));
query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 3);
hits = searcher.search(query);
assertEquals(1, hits.length());
assertEquals(hits.doc(0).get("field"), ("ddddd"));
query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 4);
hits = searcher.search(query);
assertEquals(1, hits.length());
assertEquals(hits.doc(0).get("field"), ("ddddd"));
query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 5);
hits = searcher.search(query);
assertEquals(0, hits.length());
// different field = no match:
query = new FuzzyQuery(new Term("anotherfield", "ddddX"));
query = new FuzzyQuery(new Term("anotherfield", "ddddX"), FuzzyQuery.defaultMinSimilarity, 0);
hits = searcher.search(query);
assertEquals(0, hits.length());
@ -101,31 +172,63 @@ public class TestFuzzyQuery extends TestCase {
FuzzyQuery query;
// not similar enough:
query = new FuzzyQuery(new Term("field", "xxxxx"));
query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMinSimilarity, 0);
Hits hits = searcher.search(query);
assertEquals(0, hits.length());
// edit distance to "aaaaaaa" = 3, this matches because the string is longer than
// in testDefaultFuzziness so a bigger difference is allowed:
query = new FuzzyQuery(new Term("field", "aaaaccc"));
query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 0);
hits = searcher.search(query);
assertEquals(1, hits.length());
assertEquals(hits.doc(0).get("field"), ("aaaaaaa"));
// now with prefix
query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 1);
hits = searcher.search(query);
assertEquals(1, hits.length());
assertEquals(hits.doc(0).get("field"), ("aaaaaaa"));
query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 4);
hits = searcher.search(query);
assertEquals(1, hits.length());
assertEquals(hits.doc(0).get("field"), ("aaaaaaa"));
query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 5);
hits = searcher.search(query);
assertEquals(0, hits.length());
// no match, more than half of the characters is wrong:
query = new FuzzyQuery(new Term("field", "aaacccc"));
query = new FuzzyQuery(new Term("field", "aaacccc"), FuzzyQuery.defaultMinSimilarity, 0);
hits = searcher.search(query);
assertEquals(0, hits.length());
// now with prefix
query = new FuzzyQuery(new Term("field", "aaacccc"), FuzzyQuery.defaultMinSimilarity, 2);
hits = searcher.search(query);
assertEquals(0, hits.length());
// "student" and "stellent" are indeed similar to "segment" by default:
query = new FuzzyQuery(new Term("field", "student"));
query = new FuzzyQuery(new Term("field", "student"), FuzzyQuery.defaultMinSimilarity, 0);
hits = searcher.search(query);
assertEquals(1, hits.length());
query = new FuzzyQuery(new Term("field", "stellent"));
query = new FuzzyQuery(new Term("field", "stellent"), FuzzyQuery.defaultMinSimilarity, 0);
hits = searcher.search(query);
assertEquals(1, hits.length());
// now with prefix
query = new FuzzyQuery(new Term("field", "student"), FuzzyQuery.defaultMinSimilarity, 1);
hits = searcher.search(query);
assertEquals(1, hits.length());
query = new FuzzyQuery(new Term("field", "stellent"), FuzzyQuery.defaultMinSimilarity, 1);
hits = searcher.search(query);
assertEquals(1, hits.length());
query = new FuzzyQuery(new Term("field", "student"), FuzzyQuery.defaultMinSimilarity, 2);
hits = searcher.search(query);
assertEquals(0, hits.length());
query = new FuzzyQuery(new Term("field", "stellent"), FuzzyQuery.defaultMinSimilarity, 2);
hits = searcher.search(query);
assertEquals(0, hits.length());
// "student" doesn't match anymore thanks to increased minimum similarity:
query = new FuzzyQuery(new Term("field", "student"), 0.6f);
query = new FuzzyQuery(new Term("field", "student"), 0.6f, 0);
hits = searcher.search(query);
assertEquals(0, hits.length());