You can now set the required fuzziness of FuzzyQuery. Note that QueryParser does not support this (yet?).

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150422 13f79535-47bb-0310-9956-ffa450edef68
2004-08-13 18:35:02 +00:00 · 2004-08-13 18:35:02 +00:00 · cd2ca90309
parent c65cb9a931
commit cd2ca90309
3 changed files with 63 additions and 11 deletions
--- a/src/java/org/apache/lucene/search/FuzzyQuery.java
+++ b/src/java/org/apache/lucene/search/FuzzyQuery.java
@ -20,14 +20,43 @@ import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
 import java.io.IOException;

-/** Implements the fuzzy search query */
+/** Implements the fuzzy search query. The similiarity measurement
+ * is based on the Levenshtein (edit distance) algorithm.
+ */
 public final class FuzzyQuery extends MultiTermQuery {
-  public FuzzyQuery(Term term) {
+  
+  private float minimumSimilarity;
+  
+  /**
+   * Create a new FuzzyQuery that will match terms with a similarity 
+   * of at least <code>minimumSimilarity</code> to <code>term</code>.
+   * 
+   * @param term the term to search for
+   * @param minimumSimilarity a value between 0 and 1 to set the required similarity
+   *  between the query term and the matching terms. For example, for a
+   *  <code>minimumSimilarity</code> of <code>0.5</code> a term of the same length
+   *  as the query term is considered similar to the query term if the edit distance
+   *  between both terms is less than <code>length(term)*0.5</code>.
+   * @throws IllegalArgumentException if minimumSimilarity is &gt; 1 or &lt; 0
+   */
+  public FuzzyQuery(Term term, float minimumSimilarity) throws IllegalArgumentException {
    super(term);
+    if (minimumSimilarity > 1.0f)
+      throw new IllegalArgumentException("minimumSimilarity > 1");
+    else if (minimumSimilarity < 0.0f)
+      throw new IllegalArgumentException("minimumSimilarity < 0");
+    this.minimumSimilarity = minimumSimilarity;
+  }
+
+  /**
+   * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, 0.5f)}.
+   */
+  public FuzzyQuery(Term term) {
+    this(term, 0.5f);
  }
    
  protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
-    return new FuzzyTermEnum(reader, getTerm());
+    return new FuzzyTermEnum(reader, getTerm(), minimumSimilarity);
  }
    
  public String toString(String field) {
--- a/src/java/org/apache/lucene/search/FuzzyTermEnum.java
+++ b/src/java/org/apache/lucene/search/FuzzyTermEnum.java
@ -26,16 +26,24 @@ import org.apache.lucene.index.Term;
  the enumeration is greater than all that precede it.  */
 public final class FuzzyTermEnum extends FilteredTermEnum {
    double distance;
-    boolean fieldMatch = false;
    boolean endEnum = false;

    Term searchTerm = null;
    String field = "";
    String text = "";
    int textlen;
+    float minimumSimilarity;
+    double scale_factor;
+    
    
    public FuzzyTermEnum(IndexReader reader, Term term) throws IOException {
+      this(reader, term, 0.5f);
+    }
+    
+    public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity) throws IOException {
        super();
+        minimumSimilarity = minSimilarity;
+        scale_factor = 1.0f / (1.0f - minimumSimilarity);
        searchTerm = term;
        field = searchTerm.field();
        text = searchTerm.text();
@ -53,14 +61,14 @@ public final class FuzzyTermEnum extends FilteredTermEnum {
            int targetlen = target.length();
            int dist = editDistance(text, target, textlen, targetlen);
            distance = 1 - ((double)dist / (double)Math.min(textlen, targetlen));
-            return (distance > FUZZY_THRESHOLD);
+            return (distance > minimumSimilarity);
        }
        endEnum = true;
        return false;
    }
    
    protected final float difference() {
-        return (float)((distance - FUZZY_THRESHOLD) * SCALE_FACTOR);
+        return (float)((distance - minimumSimilarity) * scale_factor);
    }
    
    public final boolean endEnum() {
@ -71,9 +79,6 @@ public final class FuzzyTermEnum extends FilteredTermEnum {
     * Compute Levenshtein distance
     ******************************/
    
-    public static final double FUZZY_THRESHOLD = 0.5;
-    public static final double SCALE_FACTOR = 1.0f / (1.0f - FUZZY_THRESHOLD);
-    
    /**
     Finds and returns the smallest of three integers 
     */
--- a/src/test/org/apache/lucene/search/TestFuzzyQuery.java
+++ b/src/test/org/apache/lucene/search/TestFuzzyQuery.java
@ -33,7 +33,7 @@ import org.apache.lucene.store.RAMDirectory;
 */
 public class TestFuzzyQuery extends TestCase {

-  public void testDefaultFuzziness() throws Exception {
+  public void testFuzziness() throws Exception {
    RAMDirectory directory = new RAMDirectory();
    IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true);
    addDoc("aaaaa", writer);
@ -90,7 +90,7 @@ public class TestFuzzyQuery extends TestCase {
    directory.close();
  }

-  public void testDefaultFuzzinessLong() throws Exception {
+  public void testFuzzinessLong() throws Exception {
    RAMDirectory directory = new RAMDirectory();
    IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true);
    addDoc("aaaaaaa", writer);
@ -124,6 +124,24 @@ public class TestFuzzyQuery extends TestCase {
    hits = searcher.search(query);
    assertEquals(1, hits.length());

+    // "student" doesn't match anymore thanks to increased minimum similarity:
+    query = new FuzzyQuery(new Term("field", "student"), 0.6f);   
+    hits = searcher.search(query);
+    assertEquals(0, hits.length());
+
+    try {
+      query = new FuzzyQuery(new Term("field", "student"), 1.1f);
+      fail("Expected IllegalArgumentException");
+    } catch (IllegalArgumentException e) {
+      // expecting exception
+    }
+    try {
+      query = new FuzzyQuery(new Term("field", "student"), -0.1f);
+      fail("Expected IllegalArgumentException");
+    } catch (IllegalArgumentException e) {
+      // expecting exception
+    }
+
    searcher.close();
    directory.close();
  }