mirror of https://github.com/apache/lucene.git
LUCENE-3846: add some nocommits; carry over test from original patch
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3846@1401358 13f79535-47bb-0310-9956-ffa450edef68
parent 935e4b7aef
commit 1e422e3e5e
Transition.java
@@ -139,6 +139,9 @@ public class Transition implements Cloneable {
   static void appendCharString(int c, StringBuilder b) {
     if (c >= 0x21 && c <= 0x7e && c != '\\' && c != '"') b.appendCodePoint(c);
     else {
+      b.append("\\\\U" + Integer.toHexString(c));
+      // nocommit
+      /*
       b.append("\\\\U");
       String s = Integer.toHexString(c);
       if (c < 0x10) b.append("0000000").append(s);
@@ -149,6 +152,7 @@ public class Transition implements Cloneable {
       else if (c < 0x1000000) b.append("00").append(s);
       else if (c < 0x10000000) b.append("0").append(s);
       else b.append(s);
+      */
     }
   }
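Aside: the zero-padding ladder this hunk disables left-pads Integer.toHexString(c) to eight hex digits. A minimal equivalent sketch (hypothetical helper name; the patch itself just appends the unpadded hex for now):

    // Sketch only, not in the patch: same output as the disabled ladder,
    // zero-padding the code point's hex form to eight digits.
    static void appendCharStringPadded(int c, StringBuilder b) {
      if (c >= 0x21 && c <= 0x7e && c != '\\' && c != '"') {
        b.appendCodePoint(c);
      } else {
        b.append("\\\\U").append(String.format("%08x", c)); // e.g. 0x41 -> "00000041"
      }
    }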
FuzzySuggester.java
@@ -15,7 +15,10 @@ package org.apache.lucene.search.suggest.analyzing;
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+import java.io.FileOutputStream;
 import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Set;
@@ -60,10 +63,14 @@ public final class FuzzySuggester extends AnalyzingSuggester {
   private final int maxEdits;
   private final boolean transpositions;
   private final int minPrefix;
 
+  // nocommit separate param for "min length before we
+  // enable fuzzy"? eg type "nusglasses" into google...
+
   /**
    * The default minimum shared (non-fuzzy) prefix. Set to <tt>2</tt>
    */
+  // nocommit should we do 1...?
   public static final int DEFAULT_MIN_PREFIX = 2;
 
   /**
@@ -156,7 +163,14 @@ public final class FuzzySuggester extends AnalyzingSuggester {
       Automaton prefix = BasicAutomata.makeString(path.ints, path.offset, minPrefix);
       int ints[] = new int[path.length-minPrefix];
       System.arraycopy(path.ints, path.offset+minPrefix, ints, 0, ints.length);
-      LevenshteinAutomata lev = new LevenshteinAutomata(ints, 256, transpositions);
+      // nocommit i think we should pass 254 max? ie
+      // exclude 0xff ... this way we can't 'edit away'
+      // the sep? or ... maybe we want to allow that to
+      // be edited away?
+      // nocommit also the 0 byte ... we use that as
+      // trailer ... we probably shouldn't allow that byte
+      // to be edited (we could add alphaMin?)
+      LevenshteinAutomata lev = new LevenshteinAutomata(ints, 255, transpositions);
       Automaton levAutomaton = lev.toAutomaton(maxEdits);
       Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton));
       combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already
@@ -164,6 +178,11 @@ public final class FuzzySuggester extends AnalyzingSuggester {
         upto++;
       }
     }
+
+    // nocommit maybe we should reduce the LevN? the added
+    // arcs add cost during intersect (extra FST arc
+    // lookups...). could be net win...
+
     if (subs.length == 0) {
       return BasicAutomata.makeEmpty(); // matches nothing
     } else if (subs.length == 1) {
@@ -186,8 +205,20 @@ public final class FuzzySuggester extends AnalyzingSuggester {
 
     @Override
     public List<Path<Pair<Long,BytesRef>>> intersectAll() throws IOException {
-      return FSTUtil.intersectPrefixPaths(toLevenshteinAutomata(automaton),fst);
+      // nocommit we don't "penalize" for edits
+      // ... shouldn't we? ie, ed=0 completions should have
+      // higher rank than ed=1, at the same "weight"? maybe
+      // we can punt on this for starters ... or maybe we
+      // can re-run each prefix path through lev0, lev1,
+      // lev2 to figure out the number of edits?
+      Automaton levA = toLevenshteinAutomata(automaton);
+      /*
+      Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
+      w.write(levA.toDot());
+      w.close();
+      System.out.println("Wrote LevA to out.dot");
+      */
+      return FSTUtil.intersectPrefixPaths(levA, fst);
     }
   }
 }
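The nocommit in intersectAll floats re-running each completed path through lev0, lev1, lev2 to recover the edit count. A rough sketch of that idea (hypothetical helper, not part of this patch; it checks a completion against successively larger Levenshtein automata, and glosses over the fact that the suggester's automata actually run over UTF-8 ints rather than chars):

    // Hypothetical sketch only: the smallest ed whose automaton accepts the
    // completion is its edit distance from the analyzed key.
    int editCount(LevenshteinAutomata lev, int maxEdits, String completion) {
      for (int ed = 0; ed <= maxEdits; ed++) {
        if (BasicOperations.run(lev.toAutomaton(ed), completion)) {
          return ed;
        }
      }
      return maxEdits + 1; // not reachable for paths produced by the levN intersect
    }

The per-result edit count could then be folded into the weight, so ed=0 completions sort ahead of ed=1 completions of the same base weight.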
FuzzySuggesterTest.java
@@ -700,6 +700,9 @@ public class FuzzySuggesterTest extends LuceneTestCase {
       System.out.println("  analyzed: " + analyzedKey);
     }
     TokenStreamToAutomaton tokenStreamToAutomaton = suggester.getTokenStreamToAutomaton();
+
+    // nocommit this is putting fox in charge of hen
+    // house! ie maybe we have a bug in suggester.toLevA ...
     Automaton automaton = suggester.toLevenshteinAutomata(suggester.toLookupAutomaton(analyzedKey));
     assertTrue(automaton.isDeterministic());
     // TODO: could be faster... but its slowCompletor for a reason
@@ -834,4 +837,261 @@ public class FuzzySuggesterTest extends LuceneTestCase {
 
     return builder.toString();
   }
+
+  private String randomSimpleString(int maxLen) {
+    final int len = _TestUtil.nextInt(random(), 1, maxLen);
+    final char[] chars = new char[len];
+    for(int j=0;j<len;j++) {
+      chars[j] = (char) ('a' + random().nextInt(4));
+    }
+    return new String(chars);
+  }
+
+  public void testRandom2() throws Throwable {
+    final int NUM = atLeast(200);
+    final List<TermFreq> answers = new ArrayList<TermFreq>();
+    final Set<String> seen = new HashSet<String>();
+    for(int i=0;i<NUM;i++) {
+      // nocommit mixin some unicode here?
+      final String s = randomSimpleString(8);
+      if (!seen.contains(s)) {
+        answers.add(new TermFreq(s, random().nextInt(1000)));
+        seen.add(s);
+      }
+    }
+
+    Collections.sort(answers, new Comparator<TermFreq>() {
+        @Override
+        public int compare(TermFreq a, TermFreq b) {
+          return a.term.compareTo(b.term);
+        }
+      });
+    if (VERBOSE) {
+      System.out.println("\nTEST: targets");
+      for(TermFreq tf : answers) {
+        System.out.println("  " + tf.term.utf8ToString() + " freq=" + tf.v);
+      }
+    }
+
+    Analyzer a = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
+    int maxEdits = random().nextBoolean() ? 1 : 2;
+    int prefixLen = random().nextInt(4);
+    boolean transpositions = random().nextBoolean();
+    // TODO: test graph analyzers
+    // TODO: test exactFirst / preserveSep permutations
+    FuzzySuggester suggest = new FuzzySuggester(a, a, 0, 256, -1, maxEdits, transpositions, prefixLen);
+
+    if (VERBOSE) {
+      System.out.println("TEST: maxEdits=" + maxEdits + " prefixLen=" + prefixLen + " transpositions=" + transpositions + " num=" + NUM);
+    }
+
+    Collections.shuffle(answers, random());
+    suggest.build(new TermFreqArrayIterator(answers.toArray(new TermFreq[answers.size()])));
+
+    final int ITERS = atLeast(100);
+    for(int iter=0;iter<ITERS;iter++) {
+      final String frag = randomSimpleString(6);
+      if (VERBOSE) {
+        System.out.println("\nTEST: iter frag=" + frag);
+      }
+      final List<LookupResult> expected = slowFuzzyMatch(prefixLen, maxEdits, transpositions, answers, frag);
+      if (VERBOSE) {
+        System.out.println("  expected: " + expected.size());
+        for(LookupResult c : expected) {
+          System.out.println("    " + c);
+        }
+      }
+      final List<LookupResult> actual = suggest.lookup(frag, false, NUM);
+      if (VERBOSE) {
+        System.out.println("  actual: " + actual.size());
+        for(LookupResult c : actual) {
+          System.out.println("    " + c);
+        }
+      }
+
+      // nocommit must fix lookup to tie break properly!!:
+      Collections.sort(actual, new CompareByCostThenAlpha());
+
+      final int limit = Math.min(expected.size(), actual.size());
+      for(int ans=0;ans<limit;ans++) {
+        final LookupResult c0 = expected.get(ans);
+        final LookupResult c1 = actual.get(ans);
+        assertEquals("expected " + c0.key +
+                     " but got " + c1.key,
+                     0,
+                     CHARSEQUENCE_COMPARATOR.compare(c0.key, c1.key));
+        assertEquals(c0.value, c1.value);
+      }
+      assertEquals(expected.size(), actual.size());
+    }
+  }
+
+  private List<LookupResult> slowFuzzyMatch(int prefixLen, int maxEdits, boolean allowTransposition, List<TermFreq> answers, String frag) {
+    final List<LookupResult> results = new ArrayList<LookupResult>();
+    final int fragLen = frag.length();
+    for(TermFreq tf : answers) {
+      //System.out.println("  check s=" + tf.term.utf8ToString());
+      boolean prefixMatches = true;
+      for(int i=0;i<prefixLen;i++) {
+        if (i == fragLen) {
+          // Prefix still matches:
+          break;
+        }
+        if (i == tf.term.length || tf.term.bytes[i] != (byte) frag.charAt(i)) {
+          prefixMatches = false;
+          break;
+        }
+      }
+      //System.out.println("    prefixMatches=" + prefixMatches);
+
+      if (prefixMatches) {
+        final int len = tf.term.length;
+        if (len >= fragLen-maxEdits) {
+          // OK it's possible:
+          //System.out.println("    possible");
+          int d;
+          final String s = tf.term.utf8ToString();
+          if (fragLen == prefixLen) {
+            d = 0;
+          } else if (false && len < fragLen) {
+            d = getDistance(frag, s, allowTransposition);
+          } else {
+            //System.out.println("      try loop");
+            d = maxEdits + 1;
+            for(int ed=-maxEdits;ed<=maxEdits;ed++) {
+              if (s.length() < fragLen - ed) {
+                continue;
+              }
+              String check = s.substring(0, fragLen-ed);
+              d = getDistance(frag, check, allowTransposition);
+              //System.out.println("      sub check s=" + check + " d=" + d);
+              if (d <= maxEdits) {
+                break;
+              }
+            }
+          }
+          if (d <= maxEdits) {
+            results.add(new LookupResult(tf.term.utf8ToString(), tf.v));
+          }
+        }
+      }
+
+      Collections.sort(results, new CompareByCostThenAlpha());
+    }
+
+    return results;
+  }
+
+  private static class CharSequenceComparator implements Comparator<CharSequence> {
+
+    @Override
+    public int compare(CharSequence o1, CharSequence o2) {
+      final int l1 = o1.length();
+      final int l2 = o2.length();
+
+      final int aStop = Math.min(l1, l2);
+      for (int i = 0; i < aStop; i++) {
+        int diff = o1.charAt(i) - o2.charAt(i);
+        if (diff != 0) {
+          return diff;
+        }
+      }
+      // One is a prefix of the other, or, they are equal:
+      return l1 - l2;
+    }
+  }
+
+  private static final Comparator<CharSequence> CHARSEQUENCE_COMPARATOR = new CharSequenceComparator();
+
+  public class CompareByCostThenAlpha implements Comparator<LookupResult> {
+    @Override
+    public int compare(LookupResult a, LookupResult b) {
+      if (a.value > b.value) {
+        return -1;
+      } else if (a.value < b.value) {
+        return 1;
+      } else {
+        final int c = CHARSEQUENCE_COMPARATOR.compare(a.key, b.key);
+        assert c != 0: "term=" + a.key;
+        return c;
+      }
+    }
+  }
+
+  // NOTE: copied from
+  // modules/suggest/src/java/org/apache/lucene/search/spell/LuceneLevenshteinDistance.java
+  // and tweaked to return the edit distance not the float
+  // lucene measure
+
+  /* Finds unicode (code point) Levenstein (edit) distance
+   * between two strings, including transpositions. */
+  public int getDistance(String target, String other, boolean allowTransposition) {
+    IntsRef targetPoints;
+    IntsRef otherPoints;
+    int n;
+    int d[][]; // cost array
+
+    // NOTE: if we cared, we could 3*m space instead of m*n space, similar to
+    // what LevenshteinDistance does, except cycling thru a ring of three
+    // horizontal cost arrays... but this comparator is never actually used by
+    // DirectSpellChecker, its only used for merging results from multiple shards
+    // in "distributed spellcheck", and its inefficient in other ways too...
+
+    // cheaper to do this up front once
+    targetPoints = toIntsRef(target);
+    otherPoints = toIntsRef(other);
+    n = targetPoints.length;
+    final int m = otherPoints.length;
+    d = new int[n+1][m+1];
+
+    if (n == 0 || m == 0) {
+      if (n == m) {
+        return 0;
+      }
+      else {
+        return Math.max(n, m);
+      }
+    }
+
+    // indexes into strings s and t
+    int i; // iterates through s
+    int j; // iterates through t
+
+    int t_j; // jth character of t
+
+    int cost; // cost
+
+    for (i = 0; i<=n; i++) {
+      d[i][0] = i;
+    }
+
+    for (j = 0; j<=m; j++) {
+      d[0][j] = j;
+    }
+
+    for (j = 1; j<=m; j++) {
+      t_j = otherPoints.ints[j-1];
+
+      for (i=1; i<=n; i++) {
+        cost = targetPoints.ints[i-1]==t_j ? 0 : 1;
+        // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
+        d[i][j] = Math.min(Math.min(d[i-1][j]+1, d[i][j-1]+1), d[i-1][j-1]+cost);
+        // transposition
+        if (allowTransposition && i > 1 && j > 1 && targetPoints.ints[i-1] == otherPoints.ints[j-2] && targetPoints.ints[i-2] == otherPoints.ints[j-1]) {
+          d[i][j] = Math.min(d[i][j], d[i-2][j-2] + cost);
+        }
+      }
+    }
+
+    return d[n][m];
+  }
+
+  private static IntsRef toIntsRef(String s) {
+    IntsRef ref = new IntsRef(s.length()); // worst case
+    int utf16Len = s.length();
+    for (int i = 0, cp = 0; i < utf16Len; i += Character.charCount(cp)) {
+      cp = ref.ints[ref.length++] = Character.codePointAt(s, i);
+    }
+    return ref;
+  }
 }
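To make slowFuzzyMatch's windowed check concrete: an edit can change the candidate's length, so the ed loop compares the fragment against candidate prefixes of length fragLen - ed for each ed in [-maxEdits, maxEdits]. A worked example with hypothetical values, not from the test:

    // frag = "abcd" (fragLen 4), maxEdits = 1, candidate s = "abxde":
    //   ed = -1: check = s.substring(0, 5) = "abxde" -> getDistance(frag, check) = 2, keep going
    //   ed =  0: check = s.substring(0, 4) = "abxd"  -> getDistance(frag, check) = 1 <= maxEdits, match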
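A quick sanity check of getDistance's transposition branch (hypothetical assertions, not in the patch): swapping two adjacent characters costs one edit with transpositions enabled, two without:

    assert getDistance("ab", "ba", true) == 1;  // one transposition
    assert getDistance("ab", "ba", false) == 2; // two substitutions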
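And since toIntsRef advances by Character.charCount(cp), a surrogate pair collapses to a single code-point entry; for example (hypothetical values):

    // "a" + U+1F600 (two UTF-16 chars) + "b": four chars, three code points
    IntsRef r = toIntsRef("a\uD83D\uDE00b");
    assert r.length == 3;
    assert r.ints[1] == 0x1F600;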