You can now set the required fuzziness of FuzzyQuery. Note that QueryParser does not support this (yet?).

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150422 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Daniel Naber 2004-08-13 18:35:02 +00:00
parent c65cb9a931
commit cd2ca90309
3 changed files with 63 additions and 11 deletions

View File

@ -20,14 +20,43 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import java.io.IOException;
/** Implements the fuzzy search query */
/** Implements the fuzzy search query. The similiarity measurement
* is based on the Levenshtein (edit distance) algorithm.
*/
public final class FuzzyQuery extends MultiTermQuery {
public FuzzyQuery(Term term) {
private float minimumSimilarity;
/**
* Create a new FuzzyQuery that will match terms with a similarity
* of at least <code>minimumSimilarity</code> to <code>term</code>.
*
* @param term the term to search for
* @param minimumSimilarity a value between 0 and 1 to set the required similarity
* between the query term and the matching terms. For example, for a
* <code>minimumSimilarity</code> of <code>0.5</code> a term of the same length
* as the query term is considered similar to the query term if the edit distance
* between both terms is less than <code>length(term)*0.5</code>.
* @throws IllegalArgumentException if minimumSimilarity is &gt; 1 or &lt; 0
*/
public FuzzyQuery(Term term, float minimumSimilarity) throws IllegalArgumentException {
super(term);
if (minimumSimilarity > 1.0f)
throw new IllegalArgumentException("minimumSimilarity > 1");
else if (minimumSimilarity < 0.0f)
throw new IllegalArgumentException("minimumSimilarity < 0");
this.minimumSimilarity = minimumSimilarity;
}
/**
* Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, 0.5f)}.
*/
public FuzzyQuery(Term term) {
this(term, 0.5f);
}
protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
return new FuzzyTermEnum(reader, getTerm());
return new FuzzyTermEnum(reader, getTerm(), minimumSimilarity);
}
public String toString(String field) {

View File

@ -26,16 +26,24 @@ import org.apache.lucene.index.Term;
the enumeration is greater than all that precede it. */
public final class FuzzyTermEnum extends FilteredTermEnum {
double distance;
boolean fieldMatch = false;
boolean endEnum = false;
Term searchTerm = null;
String field = "";
String text = "";
int textlen;
float minimumSimilarity;
double scale_factor;
public FuzzyTermEnum(IndexReader reader, Term term) throws IOException {
this(reader, term, 0.5f);
}
public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity) throws IOException {
super();
minimumSimilarity = minSimilarity;
scale_factor = 1.0f / (1.0f - minimumSimilarity);
searchTerm = term;
field = searchTerm.field();
text = searchTerm.text();
@ -53,14 +61,14 @@ public final class FuzzyTermEnum extends FilteredTermEnum {
int targetlen = target.length();
int dist = editDistance(text, target, textlen, targetlen);
distance = 1 - ((double)dist / (double)Math.min(textlen, targetlen));
return (distance > FUZZY_THRESHOLD);
return (distance > minimumSimilarity);
}
endEnum = true;
return false;
}
protected final float difference() {
return (float)((distance - FUZZY_THRESHOLD) * SCALE_FACTOR);
return (float)((distance - minimumSimilarity) * scale_factor);
}
public final boolean endEnum() {
@ -71,9 +79,6 @@ public final class FuzzyTermEnum extends FilteredTermEnum {
* Compute Levenshtein distance
******************************/
public static final double FUZZY_THRESHOLD = 0.5;
public static final double SCALE_FACTOR = 1.0f / (1.0f - FUZZY_THRESHOLD);
/**
Finds and returns the smallest of three integers
*/

View File

@ -33,7 +33,7 @@ import org.apache.lucene.store.RAMDirectory;
*/
public class TestFuzzyQuery extends TestCase {
public void testDefaultFuzziness() throws Exception {
public void testFuzziness() throws Exception {
RAMDirectory directory = new RAMDirectory();
IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true);
addDoc("aaaaa", writer);
@ -90,7 +90,7 @@ public class TestFuzzyQuery extends TestCase {
directory.close();
}
public void testDefaultFuzzinessLong() throws Exception {
public void testFuzzinessLong() throws Exception {
RAMDirectory directory = new RAMDirectory();
IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true);
addDoc("aaaaaaa", writer);
@ -124,6 +124,24 @@ public class TestFuzzyQuery extends TestCase {
hits = searcher.search(query);
assertEquals(1, hits.length());
// "student" doesn't match anymore thanks to increased minimum similarity:
query = new FuzzyQuery(new Term("field", "student"), 0.6f);
hits = searcher.search(query);
assertEquals(0, hits.length());
try {
query = new FuzzyQuery(new Term("field", "student"), 1.1f);
fail("Expected IllegalArgumentException");
} catch (IllegalArgumentException e) {
// expecting exception
}
try {
query = new FuzzyQuery(new Term("field", "student"), -0.1f);
fail("Expected IllegalArgumentException");
} catch (IllegalArgumentException e) {
// expecting exception
}
searcher.close();
directory.close();
}