LUCENE-3846: cleanup tests, add javadocs, reenable assertions, prepare reintegration

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3846@1399713 13f79535-47bb-0310-9956-ffa450edef68
2012-10-18 16:03:14 +00:00 · 2012-10-18 16:03:14 +00:00 · 83a1417bd5
parent 0c32e16186
commit 83a1417bd5
8 changed files with 160 additions and 137 deletions
--- a/lucene/common-build.xml
+++ b/lucene/common-build.xml
@ -819,11 +819,10 @@
            <classpath refid="@{junit.classpath}"/>
            <classpath refid="clover.classpath" />

-            <!-- Assertions.
            <assertions>
              <enable package="org.apache.lucene"/>
              <enable package="org.apache.solr"/>
-            </assertions> nocommit --> 
+            </assertions>  

            <!-- JVM arguments and system properties. -->
            <jvmarg line="${args}"/>
--- a/lucene/core/src/java/org/apache/lucene/util/fst/Util.java
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/Util.java
@ -857,6 +857,7 @@ public final class Util {
   */
  public static <T> Arc<T> readCeilArc(int label, FST<T> fst, Arc<T> follow,
      Arc<T> arc, BytesReader in) throws IOException {
+    // TODO maybe this is a useful in the FST class - we could simplify some other code like FSTEnum?
    if (label == FST.END_LABEL) {
      if (follow.isFinal()) {
        if (follow.target <= 0) {
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
@ -679,24 +679,40 @@ public class AnalyzingSuggester extends Lookup {
    }
  };
  
+  /**
+   * Returns a new {@link PathIntersector}  
+   */
  protected PathIntersector getPathIntersector(Automaton automaton, FST<Pair<Long,BytesRef>> fst) {
    return new PathIntersector(automaton, fst);
  }
  
+  /**
+   * This class is used to obtain the prefix paths in the automaton that also intersect the FST.
+   */
  protected static class PathIntersector {
    protected List<FSTUtil.Path<Pair<Long,BytesRef>>> intersect; 
    protected final Automaton automaton;
    protected final FST<Pair<Long,BytesRef>> fst;
+    
+    /**
+     * Creates a new {@link PathIntersector}
+     */
    public PathIntersector(Automaton automaton, FST<Pair<Long,BytesRef>> fst) {
      this.automaton = automaton;
      this.fst = fst;
    }
+    /**
+     * Returns the prefix paths for exact first top N search. 
+     */
    public List<FSTUtil.Path<Pair<Long,BytesRef>>> intersectExact() throws IOException {
-      return intersect = FSTUtil.intersectPrefixPathsExact(automaton, fst);
+      return intersect = FSTUtil.intersectPrefixPaths(automaton, fst);
    }
    
+    /**
+     * Returns the prefix paths for top N search. 
+     */
    public List<FSTUtil.Path<Pair<Long,BytesRef>>> intersectAll() throws IOException {
-      return intersect == null ?  intersect = FSTUtil.intersectPrefixPathsExact(automaton, fst) : intersect;
+      return intersect == null ?  intersect = FSTUtil.intersectPrefixPaths(automaton, fst) : intersect;
    }
  }
 }
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java
@ -64,63 +64,12 @@ public class FSTUtil {
    }
  }

-  /** Enumerates all paths in the automaton that also
-   *  intersect the FST, accumulating the FST end node and
-   *  output for each path. */
-  public static<T> List<Path<T>> intersectPrefixPathsExact(Automaton a, FST<T> fst) throws IOException {
-    final List<Path<T>> queue = new ArrayList<Path<T>>();
-    final List<Path<T>> endNodes = new ArrayList<Path<T>>();
-
-    queue.add(new Path<T>(a.getInitialState(),
-                          fst.getFirstArc(new FST.Arc<T>()),       
-                          fst.outputs.getNoOutput(),
-                          new IntsRef()));
-
-    final FST.Arc<T> scratchArc = new FST.Arc<T>();
-    final FST.BytesReader fstReader = fst.getBytesReader(0);
-
-    //System.out.println("fst/a intersect");
-
-    while (queue.size() != 0) {
-      final Path<T> path = queue.remove(queue.size()-1);
-      //System.out.println("  cycle path=" + path);
-      if (path.state.isAccept()) {
-        endNodes.add(path);
-      }
-
-      IntsRef currentInput = path.input;
-      for(Transition t : path.state.getTransitions()) {
-        // TODO: we can fix this if necessary:
-        if (t.getMin() != t.getMax()) {
-          throw new IllegalStateException("can only handle Transitions that match one character");
-        }
-
-        //System.out.println("    t=" + (char) t.getMin());
-
-        final FST.Arc<T> nextArc = fst.findTargetArc(t.getMin(), path.fstNode, scratchArc, fstReader);
-        if (nextArc != null) {
-          //System.out.println("      fst matches");
-          // Path continues:
-          IntsRef newInput = new IntsRef(currentInput.length + 1);
-          newInput.copyInts(currentInput);
-          newInput.ints[currentInput.length] = t.getMin();
-          newInput.length = currentInput.length + 1;
-
-          queue.add(new Path<T>(t.getDest(),
-                                new FST.Arc<T>().copyFrom(nextArc),
-                                fst.outputs.add(path.output, nextArc.output),
-                                newInput));
-        }
-      }
-    }
-
-    return endNodes;
-  }
-  
  /**
-   * nocommit javadoc
+   * Enumerates all minimal prefix paths in the automaton that also intersect the FST,
+   * accumulating the FST end node and output for each path.
   */
-  public static <T> List<Path<T>> intersectPrefixPaths(Automaton a, FST<T> fst) throws IOException {
+  public static <T> List<Path<T>> intersectPrefixPaths(Automaton a, FST<T> fst)
+      throws IOException {
    assert a.isDeterministic();
    final List<Path<T>> queue = new ArrayList<Path<T>>();
    final List<Path<T>> endNodes = new ArrayList<Path<T>>();
@ -135,14 +84,16 @@ public class FSTUtil {
      final Path<T> path = queue.remove(queue.size() - 1);
      if (path.state.isAccept()) {
        endNodes.add(path);
+        // we can stop here if we accept this path,
+        // we accept all further paths too
        continue;
      }
-//      System.out.println(UnicodeUtil.newString(path.input.ints, path.input.offset, path.input.length));
-
+      
      IntsRef currentInput = path.input;
      for (Transition t : path.state.getTransitions()) {
-        
-        if (t.getMin() == t.getMax()) {
+        final int min = t.getMin();
+        final int max = t.getMax();
+        if (min == max) {
          final FST.Arc<T> nextArc = fst.findTargetArc(t.getMin(),
              path.fstNode, scratchArc, fstReader);
          if (nextArc != null) {
@ -150,32 +101,26 @@ public class FSTUtil {
            newInput.copyInts(currentInput);
            newInput.ints[currentInput.length] = t.getMin();
            newInput.length = currentInput.length + 1;
-//            if (t.getDest().isAccept()) {
-//              System.out.println(UnicodeUtil.newString(newInput.ints, newInput.offset, newInput.length));              
-//            }
            queue.add(new Path<T>(t.getDest(), new FST.Arc<T>()
                .copyFrom(nextArc), fst.outputs
                .add(path.output, nextArc.output), newInput));
          }
        } else {
-          // TODO: 
+          // TODO:
          // if we accept the entire range possible in the FST (ie. 0 to 256)
          // we can simply use the prefix as the accepted state instead of
          // looking up all the
          // ranges and terminate early here?
-          FST.Arc<T> nextArc = Util.readCeilArc(t.getMin(), fst, path.fstNode,
+          FST.Arc<T> nextArc = Util.readCeilArc(min, fst, path.fstNode,
              scratchArc, fstReader);
-          while (nextArc != null && nextArc.label <= t.getMax()) {
-            assert nextArc.label <= t.getMax();
-            assert nextArc.label >= t.getMin() : nextArc.label + " "
-                + t.getMin();
+          while (nextArc != null && nextArc.label <= max) {
+            assert nextArc.label <=  max;
+            assert nextArc.label >= min : nextArc.label + " "
+                + min;
            final IntsRef newInput = new IntsRef(currentInput.length + 1);
            newInput.copyInts(currentInput);
            newInput.ints[currentInput.length] = nextArc.label;
            newInput.length = currentInput.length + 1;
-//            if (t.getDest().isAccept()) {
-//              System.out.println(UnicodeUtil.newString(newInput.ints, newInput.offset, newInput.length));              
-//            }
            queue.add(new Path<T>(t.getDest(), new FST.Arc<T>()
                .copyFrom(nextArc), fst.outputs
                .add(path.output, nextArc.output), newInput));
@ -188,13 +133,7 @@ public class FSTUtil {
        }
      }
    }
-    //System.out.println();
-    
-    for (Path<T> path2 : endNodes) {
-      if ("poales".equals(UnicodeUtil.newString(path2.input.ints, path2.input.offset, path2.input.length)))
-        System.out.println(UnicodeUtil.newString(path2.input.ints, path2.input.offset, path2.input.length));
-    }
-      return endNodes;
-    }
+    return endNodes;
+  }
  
 }
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java
@ -1,23 +1,4 @@
 package org.apache.lucene.search.suggest.analyzing;
-
-import java.io.IOException;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Set;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester.PathIntersector;
-import org.apache.lucene.search.suggest.analyzing.FSTUtil.Path;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.IntsRef;
-import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.BasicAutomata;
-import org.apache.lucene.util.automaton.BasicOperations;
-import org.apache.lucene.util.automaton.LevenshteinAutomata;
-import org.apache.lucene.util.automaton.SpecialOperations;
-import org.apache.lucene.util.fst.FST;
-import org.apache.lucene.util.fst.PairOutputs.Pair;
-
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
@ -34,24 +15,117 @@ import org.apache.lucene.util.fst.PairOutputs.Pair;
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Set;

-public class FuzzySuggester extends AnalyzingSuggester {
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.search.suggest.analyzing.FSTUtil.Path;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.BasicAutomata;
+import org.apache.lucene.util.automaton.BasicOperations;
+import org.apache.lucene.util.automaton.LevenshteinAutomata;
+import org.apache.lucene.util.automaton.SpecialOperations;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.PairOutputs.Pair;
+
+/**
+ * Implements a fuzzy {@link AnalyzingSuggester}. The similarity measurement is
+ * based on the Damerau-Levenshtein (optimal string alignment) algorithm, though
+ * you can explicitly choose classic Levenshtein by passing <code>false</code>
+ * to the <code>transpositions</code> parameter.
+ * <p>
+ * At most, this query will match terms up to
+ * {@value org.apache.lucene.util.automaton.LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}
+ * edits. Higher distances (especially with transpositions enabled), are not
+ * supported.
+ * <p>
+ * Note: complex query analyzers can have a significant impact on the lookup
+ * performance. It's recommended to not use analyzers that drop or inject terms
+ * like synonyms to keep the complexity of the prefix intersection low for good
+ * lookup performance. At index time, complex analyzers can safely be used.
+ * </p>
+ */
+public final class FuzzySuggester extends AnalyzingSuggester {
  private final int maxEdits;
  private final boolean transpositions;
  private final int minPrefix;
  
+  /**
+   * The default minimum shared (non-fuzzy) prefix. Set to <tt>2</tt>
+   */
+  public static final int DEFAULT_MIN_PREFIX = 2;
+  
+  /**
+   * The default maximum number of edits for fuzzy suggestions. Set to <tt>1</tt>
+   */
+  public static final int DEFAULT_MAX_EDITS = 1;
+  
+  /**
+   * Creates a {@link FuzzySuggester} instance initialized with default values.
+   * Calls
+   * {@link FuzzySuggester#FuzzySuggester(Analyzer, Analyzer, int, int, int, int, boolean, int)}
+   * FuzzySuggester(analyzer, analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1,
+   * DEFAULT_MAX_EDITS, true, DEFAULT_MIN_PREFIX)
+   * 
+   * @param analyzer
+   *          the analyzer used for this suggester
+   */
  public FuzzySuggester(Analyzer analyzer) {
    this(analyzer, analyzer);
  }
  
+  /**
+   * Creates a {@link FuzzySuggester} instance with an index & a query analyzer initialized with default values.
+   * Calls
+   * {@link FuzzySuggester#FuzzySuggester(Analyzer, Analyzer, int, int, int, int, boolean, int)}
+   * FuzzySuggester(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1,
+   * DEFAULT_MAX_EDITS, true, DEFAULT_MIN_PREFIX)
+   * 
+   * @param indexAnalyzer
+   *           Analyzer that will be used for analyzing suggestions while building the index.
+   * @param queryAnalyzer
+   *           Analyzer that will be used for analyzing query text during lookup
+   */
  public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
-    this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, 1, true, 1);
+    this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, DEFAULT_MAX_EDITS, true, DEFAULT_MIN_PREFIX);
  }

-  // nocommit: probably want an option to like, require the first character or something :)
+  /**
+   * Creates a {@link FuzzySuggester} instance.
+   * 
+   * @param indexAnalyzer Analyzer that will be used for
+   *        analyzing suggestions while building the index.
+   * @param queryAnalyzer Analyzer that will be used for
+   *        analyzing query text during lookup
+   * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}
+   * @param maxSurfaceFormsPerAnalyzedForm Maximum number of
+   *        surface forms to keep for a single analyzed form.
+   *        When there are too many surface forms we discard the
+   *        lowest weighted ones.
+   * @param maxGraphExpansions Maximum number of graph paths
+   *        to expand from the analyzed form.  Set this to -1 for
+   *        no limit.
+   *   
+   * @param maxEdits must be >= 0 and <= {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}.
+   * @param transpositions <code>true</code> if transpositions should be treated as a primitive 
+   *        edit operation. If this is false, comparisons will implement the classic
+   *        Levenshtein algorithm.
+   * @param minPrefix length of common (non-fuzzy) prefix
+   *          
+   */
  public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer,
      int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions, int maxEdits, boolean transpositions, int minPrefix) {
    super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions);
+    if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
+      throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
+    }
+    if (minPrefix < 0) {
+      throw new IllegalArgumentException("minPrefix must not be < 0");
+    }
    this.maxEdits = maxEdits;
    this.transpositions = transpositions;
    this.minPrefix = minPrefix;
@ -66,8 +140,7 @@ public class FuzzySuggester extends AnalyzingSuggester {
  }

  final Automaton toLevenshteinAutomata(Automaton automaton) {
-    // nocommit: how slow can this be :)
-    Set<IntsRef> ref = SpecialOperations.getFiniteStrings(automaton, -1);
+    final Set<IntsRef> ref = SpecialOperations.getFiniteStrings(automaton, -1);
    Automaton subs[] = new Automaton[ref.size()];
    int upto = 0;
    for (IntsRef path : ref) {
@ -92,7 +165,7 @@ public class FuzzySuggester extends AnalyzingSuggester {
      return subs[0];
    } else {
      Automaton a = BasicOperations.union(Arrays.asList(subs));
-      // nocommit: we could call toLevenshteinAutomata() before det? 
+      // TODO: we could call toLevenshteinAutomata() before det? 
      // this only happens if you have multiple paths anyway (e.g. synonyms)
      BasicOperations.determinize(a);
      return a;
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java
@ -48,7 +48,7 @@ import org.junit.Ignore;
 /**
 * Benchmarks tests for implementations of {@link Lookup} interface.
 */
-//@Ignore("COMMENT ME TO RUN BENCHMARKS!")
+@Ignore("COMMENT ME TO RUN BENCHMARKS!")
 public class LookupBenchmarkTest extends LuceneTestCase {
  @SuppressWarnings("unchecked")
  private final List<Class<? extends Lookup>> benchmarkClasses = Arrays.asList(
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
@ -57,6 +57,27 @@ import org.apache.lucene.util.fst.Util;

 public class FuzzySuggesterTest extends LuceneTestCase {
  
+  public void testRandomEdits() throws IOException {
+    List<TermFreq> keys = new ArrayList<TermFreq>();
+    int numTerms = atLeast(100);
+    for (int i = 0; i < numTerms; i++) {
+      keys.add(new TermFreq("boo" + _TestUtil.randomSimpleString(random()), 1 + random().nextInt(100)));
+    }
+    keys.add(new TermFreq("foo bar boo far", 12));
+    FuzzySuggester suggester = new FuzzySuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));
+    suggester.build(new TermFreqArrayIterator(keys));
+    int numIters = atLeast(10);
+    for (int i = 0; i < numIters; i++) {
+      String addRandomEdit = addRandomEdit("foo bar boo", 2);
+      List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence(addRandomEdit, random()), false, 2);
+      assertEquals(addRandomEdit, 1, results.size());
+      assertEquals("foo bar boo far", results.get(0).key.toString());
+      assertEquals(12, results.get(0).value, 0.01F);  
+    }
+    
+    
+  }
+  
  /** this is basically the WFST test ported to KeywordAnalyzer. so it acts the same */
  public void testKeyword() throws Exception {
    TermFreq keys[] = new TermFreq[] {
@ -96,12 +117,6 @@ public class FuzzySuggesterTest extends LuceneTestCase {
    assertEquals("barbara", results.get(1).key.toString());
    assertEquals(6, results.get(1).value, 0.01F);
    
-    String addRandomEdit = addRandomEdit("barbara", 1);
-    results = suggester.lookup(_TestUtil.stringToCharSequence(addRandomEdit, random()), false, 2);
-    assertEquals(addRandomEdit, 1, results.size());
-    assertEquals("barbara", results.get(0).key.toString());
-    assertEquals(6, results.get(0).value, 0.01F);
-    
    // top N of 2, but only foo is available
    results = suggester.lookup(_TestUtil.stringToCharSequence("f", random()), false, 2);
    assertEquals(1, results.size());
@ -134,7 +149,6 @@ public class FuzzySuggesterTest extends LuceneTestCase {
    assertEquals(6, results.get(2).value, 0.01F);
  }
  
-  // TODO: more tests
  /**
   * basic "standardanalyzer" test with stopword removal
   */
@ -786,7 +800,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
    assertEquals(50, results.get(1).value);
  }
  
-  public String addRandomEdit(String string, int prefixLenght) {
+  private static String addRandomEdit(String string, int prefixLenght) {
    char[] charArray = string.toCharArray();
    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < charArray.length; i++) {
@ -822,22 +836,4 @@ public class FuzzySuggesterTest extends LuceneTestCase {
    }
    return builder.toString();
  }
-  
-  public Automaton getAutomaton(String string) {
-    IntsRef path = new IntsRef();
-    Util.toUTF32(string, path);
-    if (path.length <= 1) {
-      return BasicAutomata.makeString(path.ints, path.offset, path.length);
-    } else {
-      Automaton prefix = BasicAutomata.makeString(path.ints, path.offset, 1);
-      int ints[] = new int[path.length-1-1];
-      System.arraycopy(path.ints, path.offset+1, ints, 0, ints.length);
-      LevenshteinAutomata lev = new LevenshteinAutomata(ints, 256, true);
-      Automaton levAutomaton = lev.toAutomaton(1);
-      Automaton suffix = BasicAutomata.makeString(path.ints, path.length-1, 1);
-      Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton, suffix, BasicAutomata.makeAnyString()));
-      combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already
-      return combined;
-    }
-  }
 }
--- a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleAssertionsRequired.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleAssertionsRequired.java
@ -35,8 +35,7 @@ public class TestRuleAssertionsRequired implements TestRule {
          String msg = "Test class requires enabled assertions, enable globally (-ea)" +
              " or for Solr/Lucene subpackages only: " + description.getClassName();
          System.err.println(msg);
-          // nocommit put back:
-          //throw new Exception(msg);
+          throw new Exception(msg);
        } catch (AssertionError e) {
          // Ok, enabled.
        }