speed up Fuzzy queries 20%-50%, I only did some small whitespace and comment fixes compared to the original patch

PR: 31882 Submitted by: Jonathan Hager git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150628 13f79535-47bb-0310-9956-ffa450edef68
2004-11-07 23:27:24 +00:00 · 2004-11-07 23:27:24 +00:00 · 898aa6abe6
parent c49da536de
commit 898aa6abe6
1 changed files with 201 additions and 95 deletions
--- a/src/java/org/apache/lucene/search/FuzzyTermEnum.java
+++ b/src/java/org/apache/lucene/search/FuzzyTermEnum.java
@ -16,30 +16,44 @@ package org.apache.lucene.search;
 * limitations under the License.
 */

-import java.io.IOException;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;

-/** Subclass of FilteredTermEnum for enumerating all terms that are similiar to the specified filter term.
+import java.io.IOException;

-  <p>Term enumerations are always ordered by Term.compareTo().  Each term in
-  the enumeration is greater than all that precede it.  */
+/** Subclass of FilteredTermEnum for enumerating all terms that are similiar
+ * to the specified filter term.
+ *
+ * <p>Term enumerations are always ordered by Term.compareTo().  Each term in
+ * the enumeration is greater than all that precede it.
+ */
 public final class FuzzyTermEnum extends FilteredTermEnum {
-    float similarity;
-    boolean endEnum = false;

-    Term searchTerm = null;
-    String field = "";
-    String text = "";
-    int textlen;
-    String prefix = "";
-    int prefixLength = 0;
-    float minimumSimilarity;
-    float scale_factor;
+    /* This should be somewhere around the average long word.
+     * If it is longer, we waste time and space. If it is shorter, we waste a
+     * little bit of time growing the array as we encounter longer words.
+     */
+    private static final int TYPICAL_LONGEST_WORD_IN_INDEX = 19;

+    /* Allows us save time required to create a new array
+     * everytime similarity is called.
+     */
+    private int[][] d;
+
+    private float similarity;
+    private boolean endEnum = false;
+
+    private Term searchTerm = null;
+    private final String field;
+    private final String text;
+    private final String prefix;
+
+    private final float minimumSimilarity;
+    private final float scale_factor;
+    private final int[] maxDistances = new int[TYPICAL_LONGEST_WORD_IN_INDEX];

    /**
-     * Empty prefix and minSimilarity of 0.5f are used.
+     * Creates a FuzzyTermEnum with an empty prefix and a minSimilarity of 0.5f.
     * 
     * @param reader
     * @param term
@ -51,7 +65,7 @@ public final class FuzzyTermEnum extends FilteredTermEnum {
    }
    
    /**
-     * This is the standard FuzzyTermEnum with an empty prefix.
+     * Creates a FuzzyTermEnum with an empty prefix.
     * 
     * @param reader
     * @param term
@ -74,46 +88,43 @@ public final class FuzzyTermEnum extends FilteredTermEnum {
     * @param prefixLength Length of required common prefix. Default value is 0.
     * @throws IOException
     */
-    public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity, int prefixLength) throws IOException {
+    public FuzzyTermEnum(IndexReader reader, Term term, final float minSimilarity, final int prefixLength) throws IOException {
        super();
        
-        if (minimumSimilarity >= 1.0f)
-          throw new IllegalArgumentException("minimumSimilarity >= 1");
-        else if (minimumSimilarity < 0.0f)
-          throw new IllegalArgumentException("minimumSimilarity < 0");
-        
-        minimumSimilarity = minSimilarity;
-        scale_factor = 1.0f / (1.0f - minimumSimilarity);
-        searchTerm = term;
-        field = searchTerm.field();
-        text = searchTerm.text();
-        textlen = text.length();
-        
+        if (minSimilarity >= 1.0f)
+          throw new IllegalArgumentException("minimumSimilarity cannot be greater than or equal to 1");
+        else if (minSimilarity < 0.0f)
+          throw new IllegalArgumentException("minimumSimilarity cannot be less than 0");
        if(prefixLength < 0)
-          throw new IllegalArgumentException("prefixLength < 0");
+          throw new IllegalArgumentException("prefixLength cannot be less than 0");

-        if(prefixLength > textlen)
-          prefixLength = textlen;
+        this.minimumSimilarity = minSimilarity;
+        this.scale_factor = 1.0f / (1.0f - minimumSimilarity);
+        this.searchTerm = term;
+        this.field = searchTerm.field();

-        this.prefixLength = prefixLength;
-        prefix = text.substring(0, prefixLength);
-        text = text.substring(prefixLength);
-        textlen = text.length();
+        //The prefix could be longer than the word.
+        //It's kind of silly though.  It means we must match the entire word.
+        final int fullSearchTermLength = searchTerm.text().length();
+        final int realPrefixLength = prefixLength > fullSearchTermLength ? fullSearchTermLength : prefixLength;
+
+        this.text = searchTerm.text().substring(realPrefixLength);
+        this.prefix = searchTerm.text().substring(0, realPrefixLength);
+
+        initializeMaxDistances();
+        this.d = initDistanceArray();

        setEnum(reader.terms(new Term(searchTerm.field(), prefix)));
    }

    /**
-     The termCompare method in FuzzyTermEnum uses Levenshtein distance to 
-     calculate the distance between the given term and the comparing term. 
+     * The termCompare method in FuzzyTermEnum uses Levenshtein distance to 
+     * calculate the distance between the given term and the comparing term. 
     */
    protected final boolean termCompare(Term term) {
-        String termText = term.text();
-        if (field == term.field() && termText.startsWith(prefix)) {
-            String target = termText.substring(prefixLength);
-            int targetlen = target.length();
-            int dist = editDistance(text, target, textlen, targetlen);
-            similarity = 1 - ((float)dist / (float) (prefixLength + Math.min(textlen, targetlen)));
+        if (field == term.field() && term.text().startsWith(prefix)) {
+            final String target = term.text().substring(prefix.length());
+            this.similarity = similarity(target);
            return (similarity > minimumSimilarity);
        }
        endEnum = true;
@ -133,62 +144,157 @@ public final class FuzzyTermEnum extends FilteredTermEnum {
     ******************************/
    
    /**
-     Finds and returns the smallest of three integers 
+     * Finds and returns the smallest of three integers 
     */
    private static final int min(int a, int b, int c) {
-        int t = (a < b) ? a : b;
+        final int t = (a < b) ? a : b;
        return (t < c) ? t : c;
    }

-    /**
-     * This static array saves us from the time required to create a new array
-     * everytime editDistance is called.
-     */
-    private int e[][] = new int[1][1];
+  private final int[][] initDistanceArray(){
+    return new int[this.text.length() + 1][TYPICAL_LONGEST_WORD_IN_INDEX];
+  }

-    /**
-     Levenshtein distance also known as edit distance is a measure of similiarity
-     between two strings where the distance is measured as the number of character 
-     deletions, insertions or substitutions required to transform one string to 
-     the other string. 
-     <p>This method takes in four parameters; two strings and their respective 
-     lengths to compute the Levenshtein distance between the two strings.
-     The result is returned as an integer.
-     */ 
-    private final int editDistance(String s, String t, int n, int m) {
-        if (e.length <= n || e[0].length <= m) {
-            e = new int[Math.max(e.length, n+1)][Math.max(e[0].length, m+1)];
+  /**
+   * <p>Similarity returns a number that is 1.0f or less (including negative numbers)
+   * based on how similar the Term is compared to a target term.  It returns
+   * exactly 0.0f when
+   * <pre>
+   *    editDistance &lt; maximumEditDistance</pre>
+   * Otherwise it returns:
+   * <pre>
+   *    1 - (editDistance / length)</pre>
+   * where length is the length of the shortest term (text or target) including a
+   * prefix that are identical and editDistance is the Levenshtein distance for
+   * the two words.</p>
+   *
+   * <p>Embedded within this algorithm is a fail-fast Levenshtein distance
+   * algorithm.  The fail-fast algorithm differs from the standard Levenshtein
+   * distance algorithm in that it is aborted if it is discovered that the
+   * mimimum distance between the words is greater than some threshold.
+   *
+   * <p>To calculate the maximum distance threshold we use the following formula:
+   * <pre>
+   *     (1 - minimumSimilarity) / length</pre>
+   * where length is the shortest term including any prefix that is not part of the
+   * similarity comparision.  This formula was derived by solving for what maximum value
+   * of distance returns false for the following statements:
+   * <pre>
+   *   similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
+   *   return (similarity > minimumSimilarity);</pre>
+   * where distance is the Levenshtein distance for the two words.
+   * </p>
+   * <p>Levenshtein distance (also known as edit distance) is a measure of similiarity
+   * between two strings where the distance is measured as the number of character
+   * deletions, insertions or substitutions required to transform one string to
+   * the other string.
+   * @param target the target word or phrase
+   * @return the similarity,  0.0 or less indicates that it matches less than the required
+   * threshold and 1.0 indicates that the text and target are identical
+   */
+    private synchronized final float similarity(final String target) {
+        final int m = target.length();
+        final int n = text.length();
+        if (n == 0)  {
+          //we don't have antyhing to compare.  That means if we just add
+          //the letters for m we get the new word
+          return prefix.length() == 0 ? 0.0f : 1.0f - ((float) m / prefix.length());
+        }
+        if (m == 0) {
+          return prefix.length() == 0 ? 0.0f : 1.0f - ((float) n / prefix.length());
        }
-        int d[][] = e; // matrix
-        int i; // iterates through s
-        int j; // iterates through t
-        char s_i; // ith character of s

-        if (n == 0) return m;
-        if (m == 0) return n;
+        final int maxDistance = getMaxDistance(m);
+
+        if (maxDistance < Math.abs(m-n)) {
+          //just adding the characters of m to n or vice-versa results in
+          //too many edits
+          //for example "pre" length is 3 and "prefixes" length is 8.  We can see that
+          //given this optimal circumstance, the edit distance cannot be less than 5.
+          //which is 8-3 or more precisesly Math.abs(3-8).
+          //if our maximum edit distance is 4, than we can discard this word
+          //without looking at it.
+          return 0.0f;
+        }
+
+        //let's make sure we have enough room in our array to do the distance calculations.
+        if (d[0].length <= m) {
+          growDistanceArray(m);
+        }

        // init matrix d
-        for (i = 0; i <= n; i++) d[i][0] = i;
-        for (j = 0; j <= m; j++) d[0][j] = j;
+        for (int i = 0; i <= n; i++) d[i][0] = i;
+        for (int j = 0; j <= m; j++) d[0][j] = j;
        
        // start computing edit distance
-        for (i = 1; i <= n; i++) {
-            s_i = s.charAt(i - 1);
-            for (j = 1; j <= m; j++) {
-                if (s_i != t.charAt(j-1))
+        for (int i = 1; i <= n; i++) {
+            int bestPossibleEditDistance = m;
+            final char s_i = text.charAt(i - 1);
+            for (int j = 1; j <= m; j++) {
+                if (s_i != target.charAt(j-1)) {
                    d[i][j] = min(d[i-1][j], d[i][j-1], d[i-1][j-1])+1;
-                else d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]);
+                }
+                else {
+                  d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]);
+                }
+                bestPossibleEditDistance = Math.min(bestPossibleEditDistance, d[i][j]);
            }
+
+          //After calculating row i, the best possible edit distance
+          //can be found by found by finding the smallest value in a given column.
+          //If the bestPossibleEditDistance is greater than the max distance, abort.
+
+          if (i > maxDistance && bestPossibleEditDistance > maxDistance) {  //equal is okay, but not greater
+            //the closest the target can be to the text is just too far away.
+            //this target is leaving the party early.
+            return 0.0f;
+          }
        }

-        // we got the result!
-        return d[n][m];
+        // this will return less than 0.0 when the edit distance is
+        // greater than the number of characters in the shorter word.
+        // but this was the formula that was previously used in FuzzyTermEnum,
+        // so it has not been changed (even though minimumSimilarity must be
+        // greater than 0.0)
+        return 1.0f - ((float)d[n][m] / (float) (prefix.length() + Math.min(n, m)));
+
    }

-  public void close() throws IOException {
-      super.close();
-      searchTerm = null;
-      field = null;
-      text = null;
+  /**
+   * Grow the second dimension of the array, so that we can calculate the
+   * Levenshtein difference.
+   */
+  private void growDistanceArray(int m) {
+    for (int i = 0; i < d.length; i++)
+    {
+      d[i] = new int[m+1];
+    }
  }
+
+  /**
+   * The max Distance is the maximum Levenshtein distance for the text
+   * compared to some other value that results in score that is
+   * better than the minimum similarity.
+   * @param m the length of the "other value"
+   * @return the maximum levenshtein distance that we care about
+   */
+  private final int getMaxDistance(int m) {
+    return (m < maxDistances.length) ? maxDistances[m] : calculateMaxDistance(m);
+  }
+
+  private void initializeMaxDistances() {
+    for (int i = 0; i < maxDistances.length; i++)
+    {
+      maxDistances[i] = calculateMaxDistance(i);
+    }
+  }
+  
+  private int calculateMaxDistance(int m) {
+    return (int) ((1-minimumSimilarity) * (Math.min(text.length(), m) + prefix.length()));
+  }
+
+  public void close() throws IOException {
+    super.close();  //call super.close() and let the garbage collector do its work.
+  }
+  
 }