LUCENE-3846: add some nocommits; carry over test from original patch

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3846@1401358 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2012-10-23 17:39:59 +00:00
parent 935e4b7aef
commit 1e422e3e5e
3 changed files with 298 additions and 3 deletions

lucene/core/src/java/org/apache/lucene/util/automaton/Transition.java

@@ -139,6 +139,9 @@ public class Transition implements Cloneable {
  static void appendCharString(int c, StringBuilder b) {
    if (c >= 0x21 && c <= 0x7e && c != '\\' && c != '"') b.appendCodePoint(c);
    else {
      b.append("\\\\U" + Integer.toHexString(c));
      // nocommit
      /*
      b.append("\\\\U");
      String s = Integer.toHexString(c);
      if (c < 0x10) b.append("0000000").append(s);
@@ -149,6 +152,7 @@ public class Transition implements Cloneable {
      else if (c < 0x1000000) b.append("00").append(s);
      else if (c < 0x10000000) b.append("0").append(s);
      else b.append(s);
      */
    }
  }
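
Note on the hunk above: the commented-out cascade zero-pads the hex string to eight digits, while the new one-liner emits unpadded hex, which is exactly what the nocommit flags. A minimal padded equivalent (illustrative only, not part of this commit, and assuming a java.util.Locale import):

      // pads to 8 lowercase hex digits, matching the old if/else cascade exactly
      b.append("\\\\U").append(String.format(Locale.ROOT, "%08x", c));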

lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java

@@ -15,7 +15,10 @@ package org.apache.lucene.search.suggest.analyzing;
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
@@ -61,9 +64,13 @@ public final class FuzzySuggester extends AnalyzingSuggester {
  private final boolean transpositions;
  private final int minPrefix;

  // nocommit separate param for "min length before we
  // enable fuzzy"? eg type "nusglasses" into google...

  /**
   * The default minimum shared (non-fuzzy) prefix. Set to <tt>2</tt>.
   */
  // nocommit should we do 1...?
  public static final int DEFAULT_MIN_PREFIX = 2;

  /**
@@ -156,7 +163,14 @@ public final class FuzzySuggester extends AnalyzingSuggester {
        Automaton prefix = BasicAutomata.makeString(path.ints, path.offset, minPrefix);
        int ints[] = new int[path.length-minPrefix];
        System.arraycopy(path.ints, path.offset+minPrefix, ints, 0, ints.length);
-       LevenshteinAutomata lev = new LevenshteinAutomata(ints, 256, transpositions);
+       // nocommit i think we should pass 254 max? ie
+       // exclude 0xff ... this way we can't 'edit away'
+       // the sep? or ... maybe we want to allow that to
+       // be edited away?
+       // nocommit also the 0 byte ... we use that as
+       // trailer ... we probably shouldn't allow that byte
+       // to be edited (we could add alphaMin?)
+       LevenshteinAutomata lev = new LevenshteinAutomata(ints, 255, transpositions);
        Automaton levAutomaton = lev.toAutomaton(maxEdits);
        Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton));
        combined.setDeterministic(true); // it's like the special case in concatenate itself, except we cloneExpanded already
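
The two nocommits above are about the alphabet bound passed to LevenshteinAutomata: per the comments, 0xff is used as the sep byte and 0x00 as the trailer, so leaving those inside the edit alphabet lets an edit fabricate or erase those markers. A sketch of the first suggestion, illustrative only (excluding the 0x00 trailer as well would need something like a new alphaMin parameter, which LevenshteinAutomata does not have):

        // alphaMax=254 keeps 0xff out of the alphabet, so an insertion or
        // substitution can never produce the sep byte
        LevenshteinAutomata lev = new LevenshteinAutomata(ints, 254, transpositions);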
@@ -164,6 +178,11 @@ public final class FuzzySuggester extends AnalyzingSuggester {
        upto++;
      }
    }

    // nocommit maybe we should reduce the LevN? the added
    // arcs add cost during intersect (extra FST arc
    // lookups...). could be net win...
    if (subs.length == 0) {
      return BasicAutomata.makeEmpty(); // matches nothing
    } else if (subs.length == 1) {
@@ -186,8 +205,20 @@ public final class FuzzySuggester extends AnalyzingSuggester {
      @Override
      public List<Path<Pair<Long,BytesRef>>> intersectAll() throws IOException {
-       return FSTUtil.intersectPrefixPaths(toLevenshteinAutomata(automaton),fst);
+       // nocommit we don't "penalize" for edits
+       // ... shouldn't we? ie, ed=0 completions should have
+       // higher rank than ed=1, at the same "weight"? maybe
+       // we can punt on this for starters ... or maybe we
+       // can re-run each prefix path through lev0, lev1,
+       // lev2 to figure out the number of edits?
+       Automaton levA = toLevenshteinAutomata(automaton);
+       /*
+       Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
+       w.write(levA.toDot());
+       w.close();
+       System.out.println("Wrote LevA to out.dot");
+       */
+       return FSTUtil.intersectPrefixPaths(levA, fst);
      }
    }
  }
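
Two side notes on intersectAll: the commented-out dump block is what the three new java.io imports at the top of the file are for, and its output can be rendered with Graphviz (dot -Tpng out.dot -o out.png). For the "penalize for edits" nocommit, one illustrative shape of the re-run idea from the comment, with hypothetical names and not part of this patch:

    // Re-check a completion against lev0, lev1, ... to recover its edit
    // distance; the first automaton that accepts it gives the distance.
    // levByEd[ed] would be built once via lev.toAutomaton(ed).
    private static int editDistance(Automaton[] levByEd, String output) {
      for (int ed = 0; ed < levByEd.length; ed++) {
        if (BasicOperations.run(levByEd[ed], output)) {
          return ed;
        }
      }
      return levByEd.length; // more edits than we allow
    }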

lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java

@@ -700,6 +700,9 @@ public class FuzzySuggesterTest extends LuceneTestCase {
        System.out.println(" analyzed: " + analyzedKey);
      }
      TokenStreamToAutomaton tokenStreamToAutomaton = suggester.getTokenStreamToAutomaton();
      // nocommit this is putting the fox in charge of the hen
      // house! ie maybe we have a bug in suggester.toLevA ...
      Automaton automaton = suggester.toLevenshteinAutomata(suggester.toLookupAutomaton(analyzedKey));
      assertTrue(automaton.isDeterministic());
      // TODO: could be faster... but it's slowCompletor for a reason
@@ -834,4 +837,261 @@
    return builder.toString();
  }

  private String randomSimpleString(int maxLen) {
    final int len = _TestUtil.nextInt(random(), 1, maxLen);
    final char[] chars = new char[len];
    for(int j=0;j<len;j++) {
      chars[j] = (char) ('a' + random().nextInt(4));
    }
    return new String(chars);
  }
  public void testRandom2() throws Throwable {
    final int NUM = atLeast(200);
    final List<TermFreq> answers = new ArrayList<TermFreq>();
    final Set<String> seen = new HashSet<String>();
    for(int i=0;i<NUM;i++) {
      // nocommit mix in some unicode here?
      final String s = randomSimpleString(8);
      if (!seen.contains(s)) {
        answers.add(new TermFreq(s, random().nextInt(1000)));
        seen.add(s);
      }
    }

    Collections.sort(answers, new Comparator<TermFreq>() {
        @Override
        public int compare(TermFreq a, TermFreq b) {
          return a.term.compareTo(b.term);
        }
      });

    if (VERBOSE) {
      System.out.println("\nTEST: targets");
      for(TermFreq tf : answers) {
        System.out.println(" " + tf.term.utf8ToString() + " freq=" + tf.v);
      }
    }

    Analyzer a = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
    int maxEdits = random().nextBoolean() ? 1 : 2;
    int prefixLen = random().nextInt(4);
    boolean transpositions = random().nextBoolean();
    // TODO: test graph analyzers
    // TODO: test exactFirst / preserveSep permutations
    FuzzySuggester suggest = new FuzzySuggester(a, a, 0, 256, -1, maxEdits, transpositions, prefixLen);

    if (VERBOSE) {
      System.out.println("TEST: maxEdits=" + maxEdits + " prefixLen=" + prefixLen + " transpositions=" + transpositions + " num=" + NUM);
    }

    Collections.shuffle(answers, random());
    suggest.build(new TermFreqArrayIterator(answers.toArray(new TermFreq[answers.size()])));

    final int ITERS = atLeast(100);
    for(int iter=0;iter<ITERS;iter++) {
      final String frag = randomSimpleString(6);
      if (VERBOSE) {
        System.out.println("\nTEST: iter frag=" + frag);
      }

      final List<LookupResult> expected = slowFuzzyMatch(prefixLen, maxEdits, transpositions, answers, frag);
      if (VERBOSE) {
        System.out.println(" expected: " + expected.size());
        for(LookupResult c : expected) {
          System.out.println(" " + c);
        }
      }

      final List<LookupResult> actual = suggest.lookup(frag, false, NUM);
      if (VERBOSE) {
        System.out.println(" actual: " + actual.size());
        for(LookupResult c : actual) {
          System.out.println(" " + c);
        }
      }

      // nocommit must fix lookup to tie-break properly!!:
      Collections.sort(actual, new CompareByCostThenAlpha());

      final int limit = Math.min(expected.size(), actual.size());
      for(int ans=0;ans<limit;ans++) {
        final LookupResult c0 = expected.get(ans);
        final LookupResult c1 = actual.get(ans);
        assertEquals("expected " + c0.key +
                     " but got " + c1.key,
                     0,
                     CHARSEQUENCE_COMPARATOR.compare(c0.key, c1.key));
        assertEquals(c0.value, c1.value);
      }
      assertEquals(expected.size(), actual.size());
    }
  }
  private List<LookupResult> slowFuzzyMatch(int prefixLen, int maxEdits, boolean allowTransposition, List<TermFreq> answers, String frag) {
    final List<LookupResult> results = new ArrayList<LookupResult>();
    final int fragLen = frag.length();
    for(TermFreq tf : answers) {
      //System.out.println(" check s=" + tf.term.utf8ToString());
      boolean prefixMatches = true;
      for(int i=0;i<prefixLen;i++) {
        if (i == fragLen) {
          // Prefix still matches:
          break;
        }
        if (i == tf.term.length || tf.term.bytes[i] != (byte) frag.charAt(i)) {
          prefixMatches = false;
          break;
        }
      }
      //System.out.println(" prefixMatches=" + prefixMatches);

      if (prefixMatches) {
        final int len = tf.term.length;
        if (len >= fragLen-maxEdits) {
          // OK it's possible:
          //System.out.println(" possible");
          int d;
          final String s = tf.term.utf8ToString();
          if (fragLen == prefixLen) {
            d = 0;
          } else if (false && len < fragLen) {
            d = getDistance(frag, s, allowTransposition);
          } else {
            //System.out.println(" try loop");
            d = maxEdits + 1;
            for(int ed=-maxEdits;ed<=maxEdits;ed++) {
              if (s.length() < fragLen - ed) {
                continue;
              }
              String check = s.substring(0, fragLen-ed);
              d = getDistance(frag, check, allowTransposition);
              //System.out.println(" sub check s=" + check + " d=" + d);
              if (d <= maxEdits) {
                break;
              }
            }
          }
          if (d <= maxEdits) {
            results.add(new LookupResult(tf.term.utf8ToString(), tf.v));
          }
        }
      }
    }

    Collections.sort(results, new CompareByCostThenAlpha());
    return results;
  }
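
To make the ed window in slowFuzzyMatch concrete: one edit can stretch or shrink the matched prefix by one character, so for frag="abcd" with maxEdits=1 the loop tries candidate prefixes of length 5, 4, then 3. Illustrative checks using the test's own getDistance helper:

    assertEquals(1, getDistance("abcd", "abxcd", false)); // ed=-1 window: insertion
    assertEquals(0, getDistance("abcd", "abcd", false));  // ed=0 window: exact
    assertEquals(1, getDistance("abcd", "abd", false));   // ed=+1 window: deletion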
  private static class CharSequenceComparator implements Comparator<CharSequence> {
    @Override
    public int compare(CharSequence o1, CharSequence o2) {
      final int l1 = o1.length();
      final int l2 = o2.length();

      final int aStop = Math.min(l1, l2);
      for (int i = 0; i < aStop; i++) {
        int diff = o1.charAt(i) - o2.charAt(i);
        if (diff != 0) {
          return diff;
        }
      }

      // One is a prefix of the other, or they are equal:
      return l1 - l2;
    }
  }

  private static final Comparator<CharSequence> CHARSEQUENCE_COMPARATOR = new CharSequenceComparator();

  public class CompareByCostThenAlpha implements Comparator<LookupResult> {
    @Override
    public int compare(LookupResult a, LookupResult b) {
      if (a.value > b.value) {
        return -1;
      } else if (a.value < b.value) {
        return 1;
      } else {
        final int c = CHARSEQUENCE_COMPARATOR.compare(a.key, b.key);
        assert c != 0: "term=" + a.key;
        return c;
      }
    }
  }
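
CompareByCostThenAlpha sorts by weight descending, then key ascending, which is the order lookup results are expected to come back in. A small illustration (hypothetical values):

    List<LookupResult> r = new ArrayList<LookupResult>();
    r.add(new LookupResult("apply", 10));
    r.add(new LookupResult("ant", 3));
    r.add(new LookupResult("apple", 10));
    Collections.sort(r, new CompareByCostThenAlpha());
    // -> apple (10), apply (10), ant (3): weight desc, ties broken alphabetically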
  // NOTE: copied from
  // modules/suggest/src/java/org/apache/lucene/search/spell/LuceneLevenshteinDistance.java
  // and tweaked to return the edit distance, not the float
  // Lucene measure

  /* Finds unicode (code point) Levenshtein (edit) distance
   * between two strings, including transpositions. */
  public int getDistance(String target, String other, boolean allowTransposition) {
    IntsRef targetPoints;
    IntsRef otherPoints;
    int n;
    int d[][]; // cost array

    // NOTE: if we cared, we could use 3*m space instead of m*n space, similar to
    // what LevenshteinDistance does, except cycling thru a ring of three
    // horizontal cost arrays... but this comparator is never actually used by
    // DirectSpellChecker, it's only used for merging results from multiple shards
    // in "distributed spellcheck", and it's inefficient in other ways too...

    // cheaper to do this up front once
    targetPoints = toIntsRef(target);
    otherPoints = toIntsRef(other);
    n = targetPoints.length;
    final int m = otherPoints.length;
    d = new int[n+1][m+1];

    if (n == 0 || m == 0) {
      if (n == m) {
        return 0;
      }
      else {
        return Math.max(n, m);
      }
    }

    // indexes into strings s and t
    int i; // iterates through s
    int j; // iterates through t
    int t_j; // jth character of t
    int cost; // cost

    for (i = 0; i<=n; i++) {
      d[i][0] = i;
    }

    for (j = 0; j<=m; j++) {
      d[0][j] = j;
    }

    for (j = 1; j<=m; j++) {
      t_j = otherPoints.ints[j-1];
      for (i=1; i<=n; i++) {
        cost = targetPoints.ints[i-1]==t_j ? 0 : 1;
        // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
        d[i][j] = Math.min(Math.min(d[i-1][j]+1, d[i][j-1]+1), d[i-1][j-1]+cost);
        // transposition
        if (allowTransposition && i > 1 && j > 1 && targetPoints.ints[i-1] == otherPoints.ints[j-2] && targetPoints.ints[i-2] == otherPoints.ints[j-1]) {
          d[i][j] = Math.min(d[i][j], d[i-2][j-2] + cost);
        }
      }
    }

    return d[n][m];
  }
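
A few sanity checks for the DP above (illustrative, not part of the patch); the transposition branch is what drops "ab" -> "ba" from two substitutions to a single edit:

    assertEquals(3, getDistance("kitten", "sitting", false)); // classic example
    assertEquals(2, getDistance("ab", "ba", false));          // two substitutions
    assertEquals(1, getDistance("ab", "ba", true));           // one transposition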
  private static IntsRef toIntsRef(String s) {
    IntsRef ref = new IntsRef(s.length()); // worst case
    int utf16Len = s.length();
    for (int i = 0, cp = 0; i < utf16Len; i += Character.charCount(cp)) {
      cp = ref.ints[ref.length++] = Character.codePointAt(s, i);
    }
    return ref;
  }
}
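
Note that toIntsRef walks code points rather than UTF-16 chars, so a surrogate pair collapses to a single int. A hypothetical check:

    IntsRef r = toIntsRef("a\uD83D\uDE00b"); // 'a', U+1F600, 'b'
    assertEquals(3, r.length);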