LUCENE-3846: remove allowSepEdit (it doesn't work); add test

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3846@1403413 13f79535-47bb-0310-9956-ffa450edef68
2012-10-29 16:43:03 +00:00 · 2012-10-29 16:43:03 +00:00 · c9a38e3236
parent bd8ef39a3b
commit c9a38e3236
3 changed files with 41 additions and 24 deletions
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java
@ -36,7 +36,7 @@ public class LevenshteinAutomata {
  final int word[];
  /* the automata alphabet. */
  final int alphabet[];
-  /* the maximum symbol in the alphabet (e.g. 256 for UTF-8 or 10FFFF for UTF-32) */
+  /* the maximum symbol in the alphabet (e.g. 255 for UTF-8 bytes or 0x10FFFF for UTF-32 code points) */
  final int alphaMax;

  /* the ranges outside of alphabet */
@ -53,18 +53,24 @@ public class LevenshteinAutomata {
  public LevenshteinAutomata(String input, boolean withTranspositions) {
    this(codePoints(input), Character.MAX_CODE_POINT, withTranspositions);
  }
-  
+
  /**
-   * Expert: Don't use this!
+   * Expert: specify a custom maximum possible symbol (alphaMax); the String
+   * constructor uses Character.MAX_CODE_POINT. No symbol in word may exceed alphaMax.
   */
  public LevenshteinAutomata(int[] word, int alphaMax, boolean withTranspositions) {
    this.word = word;
    this.alphaMax = alphaMax;
-    
+
    // calculate the alphabet
    SortedSet<Integer> set = new TreeSet<Integer>();
-    for (int i = 0; i < word.length; i++)
-      set.add(word[i]);
+    for (int i = 0; i < word.length; i++) {
+      int v = word[i];
+      if (v > alphaMax) {
+        throw new IllegalArgumentException("alphaMax exceeded by symbol " + v + " in word");
+      }
+      set.add(v);
+    }
    alphabet = new int[set.size()];
    Iterator<Integer> iterator = set.iterator();
    for (int i = 0; i < alphabet.length; i++)
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java
@ -72,7 +72,6 @@ public final class FuzzySuggester extends AnalyzingSuggester {
  private final boolean transpositions;
  private final int nonFuzzyPrefix;
  private final int minFuzzyLength;
-  private final boolean allowSepEdit;

  /**
   * The default minimum length of the key passed to {@link
@ -91,11 +90,6 @@ public final class FuzzySuggester extends AnalyzingSuggester {
   */
  public static final int DEFAULT_MAX_EDITS = 1;

-  /**
-   * We allow token separator to be deleted/inserted, by default.
-   */
-  public static final boolean DEFAULT_ALLOW_SEP_EDIT = true;
-  
  /**
   * Creates a {@link FuzzySuggester} instance initialized with default values.
   * 
@ -115,7 +109,7 @@ public final class FuzzySuggester extends AnalyzingSuggester {
   */
  public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
    this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, DEFAULT_MAX_EDITS, true,
-         DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH, DEFAULT_ALLOW_SEP_EDIT);
+         DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH);
  }

  /**
@ -139,12 +133,11 @@ public final class FuzzySuggester extends AnalyzingSuggester {
   *        Levenshtein algorithm.
   * @param nonFuzzyPrefix length of common (non-fuzzy) prefix (see default {@link #DEFAULT_NON_FUZZY_PREFIX}
   * @param minFuzzyLength minimum length of lookup key before any edits are allowed (see default {@link #DEFAULT_MIN_FUZZY_LENGTH})
-   * @param allowSepEdit if true, the token separater is allowed to be an edit (so words may be split/joined) (see default {@link #DEFAULT_ALLOW_SEP_EDIT})
   */
  public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer,
                        int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
                        int maxEdits, boolean transpositions, int nonFuzzyPrefix,
-                        int minFuzzyLength, boolean allowSepEdit) {
+                        int minFuzzyLength) {
    super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions);
    if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
      throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
@ -160,7 +153,6 @@ public final class FuzzySuggester extends AnalyzingSuggester {
    this.transpositions = transpositions;
    this.nonFuzzyPrefix = nonFuzzyPrefix;
    this.minFuzzyLength = minFuzzyLength;
-    this.allowSepEdit = allowSepEdit;
  }
  
  @Override
@ -206,7 +198,7 @@ public final class FuzzySuggester extends AnalyzingSuggester {
        // to allow the trailing dedup bytes to be
        // edited... but then 0 byte is "in general" allowed
        // on input (but not in UTF8).
-        LevenshteinAutomata lev = new LevenshteinAutomata(ints, allowSepEdit ? 255 : 254, transpositions);
+        LevenshteinAutomata lev = new LevenshteinAutomata(ints, 255, transpositions);
        Automaton levAutomaton = lev.toAutomaton(maxEdits);
        Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton));
        combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
@ -184,7 +184,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
    int options = 0;

    Analyzer a = new MockAnalyzer(random());
-    FuzzySuggester suggester = new FuzzySuggester(a, a, options, 256, -1, 1, true, 1, 3, true);
+    FuzzySuggester suggester = new FuzzySuggester(a, a, options, 256, -1, 1, true, 1, 3);
    suggester.build(new TermFreqArrayIterator(keys));
    // TODO: would be nice if "ab " would allow the test to
    // pass, and more generally if the analyzer can know
@ -387,7 +387,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
  public void testExactFirst() throws Exception {

    Analyzer a = getUnusualAnalyzer();
-    FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3, true);
+    FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3);
    suggester.build(new TermFreqArrayIterator(new TermFreq[] {
          new TermFreq("x y", 1),
          new TermFreq("x y z", 3),
@ -426,7 +426,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
  public void testNonExactFirst() throws Exception {

    Analyzer a = getUnusualAnalyzer();
-    FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3, true);
+    FuzzySuggester suggester = new FuzzySuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1, 1, true, 1, 3);

    suggester.build(new TermFreqArrayIterator(new TermFreq[] {
          new TermFreq("x y", 1),
@ -645,7 +645,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {

    Analyzer a = new MockTokenEatingAnalyzer(numStopChars, preserveHoles);
    FuzzySuggester suggester = new FuzzySuggester(a, a,
-                                                  preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1, 1, false, 1, 3, true);
+                                                  preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1, 1, false, 1, 3);
    suggester.build(new TermFreqArrayIterator(keys));

    for (String prefix : allPrefixes) {
@ -775,10 +775,9 @@ public class FuzzySuggesterTest extends LuceneTestCase {
    }
  }

- 
  public void testMaxSurfaceFormsPerAnalyzedForm() throws Exception {
    Analyzer a = new MockAnalyzer(random());
-    FuzzySuggester suggester = new FuzzySuggester(a, a, 0, 2, -1, 1, true, 1, 3, true);
+    FuzzySuggester suggester = new FuzzySuggester(a, a, 0, 2, -1, 1, true, 1, 3);

    List<TermFreq> keys = Arrays.asList(new TermFreq[] {
        new TermFreq("a", 40),
@ -796,6 +795,26 @@ public class FuzzySuggesterTest extends LuceneTestCase {
    assertEquals("a ", results.get(1).key);
    assertEquals(50, results.get(1).value);
  }
+
+  public void testEditSeps() throws Exception { // lookups that differ from the stored keys only by spaces must still match
+    Analyzer a = new MockAnalyzer(random());
+    FuzzySuggester suggester = new FuzzySuggester(a, a, FuzzySuggester.PRESERVE_SEP, 2, -1, 2, true, 1, 3); // maxSurfaceFormsPerAnalyzedForm=2, maxGraphExpansions=-1, maxEdits=2, transpositions=true, nonFuzzyPrefix=1, minFuzzyLength=3
+
+    List<TermFreq> keys = Arrays.asList(new TermFreq[] {
+        new TermFreq("foo bar", 40),
+        new TermFreq("foo bar baz", 50),
+        new TermFreq("barbaz", 60),
+        new TermFreq("barbazfoo", 10),
+      });
+
+    Collections.shuffle(keys, random()); // randomize the order in which keys are fed to build()
+    suggester.build(new TermFreqArrayIterator(keys));
+
+    assertEquals("[foo bar baz/50, foo bar/40]", suggester.lookup("foobar", false, 5).toString()); // query lacks the space the keys have
+    assertEquals("[foo bar baz/50]", suggester.lookup("foobarbaz", false, 5).toString()); // query lacks both spaces of the key
+    assertEquals("[barbaz/60, barbazfoo/10]", suggester.lookup("bar baz", false, 5).toString()); // query has a space the keys lack
+    assertEquals("[barbazfoo/10]", suggester.lookup("bar baz foo", false, 5).toString()); // query has two spaces the key lacks
+  }
  
  private static String addRandomEdit(String string, int prefixLength) {
    char[] input = string.toCharArray();
@ -891,7 +910,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
    boolean transpositions = random().nextBoolean();
    // TODO: test graph analyzers
    // TODO: test exactFirst / preserveSep permutations
-    FuzzySuggester suggest = new FuzzySuggester(a, a, 0, 256, -1, maxEdits, transpositions, prefixLen, 3, true);
+    FuzzySuggester suggest = new FuzzySuggester(a, a, 0, 256, -1, maxEdits, transpositions, prefixLen, 3);

    if (VERBOSE) {
      System.out.println("TEST: maxEdits=" + maxEdits + " prefixLen=" + prefixLen + " transpositions=" + transpositions + " num=" + NUM);