LUCENE-3846: add new FuzzySuggester

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1403779 13f79535-47bb-0310-9956-ffa450edef68
2012-10-30 16:47:17 +00:00 · 2012-10-30 16:47:17 +00:00 · d8e44bd09d
parent 472242dc8f 7f7a0058e2
commit d8e44bd09d
11 changed files with 1654 additions and 111 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -49,6 +49,10 @@ New Features
  for better search performance. 
  (Han Jiang, Adrien Grand, Robert Muir, Mike McCandless)
 * LUCENE-3846: New FuzzySuggester, like AnalyzingSuggester except it
  also finds completions allowing for fuzzy edits in the input string.
  (Robert Muir, Simon Willnauer, Mike McCandless)
 API Changes
 * LUCENE-4399: Deprecated AppendingCodec. Lucene's term dictionaries
--- a/lucene/common-build.xml
+++ b/lucene/common-build.xml
@ -833,7 +833,7 @@
            <assertions>
              <enable package="org.apache.lucene"/>
              <enable package="org.apache.solr"/>
-            </assertions>
+            </assertions>  
            <!-- JVM arguments and system properties. -->
            <jvmarg line="${args}"/>
--- a/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
@ -22,6 +22,7 @@ import java.io.IOException;
 import java.io.OutputStreamWriter;
 import java.io.Writer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
@ -88,6 +89,7 @@ public class TokenStreamToAutomaton {
    final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
    final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
    final BytesRef term = termBytesAtt.getBytesRef();
    in.reset();
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/BasicAutomata.java
@ -240,6 +240,20 @@ final public class BasicAutomata {
    a.deterministic = true;
    return a;
  }
  /**
   * Returns a new (deterministic) automaton that accepts exactly the sequence
   * of symbols <code>word[offset] .. word[offset+length-1]</code>.
   *
   * @param word array holding the symbols to accept
   * @param offset index of the first symbol to use
   * @param length number of symbols to use; zero yields an automaton whose
   *        initial state is also its accept state
   */
  public static Automaton makeString(int[] word, int offset, int length) {
    final Automaton automaton = new Automaton();
    automaton.setDeterministic(true);
    State current = new State();
    automaton.initial = current;
    int pos = offset;
    // Chain one fresh state per symbol: current --symbol--> next
    while (pos < offset + length) {
      final State next = new State();
      current.addTransition(new Transition(word[pos], next));
      current = next;
      pos++;
    }
    // The last state of the chain is the only accept state.
    current.accept = true;
    return automaton;
  }
  /**
   * Returns a new (deterministic and minimal) automaton that accepts the union
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java
@ -33,12 +33,13 @@ public class LevenshteinAutomata {
  /** @lucene.internal */
  public static final int MAXIMUM_SUPPORTED_DISTANCE = 2;
  /* input word */
  final String input;
  final int word[];
  /* the automata alphabet. */
  final int alphabet[];
  /* the maximum symbol in the alphabet (e.g. 255 for UTF-8 or 10FFFF for UTF-32) */
  final int alphaMax;
-  /* the unicode ranges outside of alphabet */
+  /* the ranges outside of alphabet */
  final int rangeLower[];
  final int rangeUpper[];
  int numRanges = 0;
@ -50,17 +51,26 @@ public class LevenshteinAutomata {
   * Optionally count transpositions as a primitive edit.
   */
  public LevenshteinAutomata(String input, boolean withTranspositions) {
-    this.input = input;
+    this(codePoints(input), Character.MAX_CODE_POINT, withTranspositions);
-    int length = Character.codePointCount(input, 0, input.length());
+  }
-    word = new int[length];
+
-    for (int i = 0, j = 0, cp = 0; i < input.length(); i += Character.charCount(cp)) {
+  /**
-      word[j++] = cp = input.codePointAt(i);
+   * Expert: specify a custom maximum possible symbol
-    }
+   * (alphaMax); default is Character.MAX_CODE_POINT.
-    
+   */
  public LevenshteinAutomata(int[] word, int alphaMax, boolean withTranspositions) {
    this.word = word;
    this.alphaMax = alphaMax;
    // calculate the alphabet
    SortedSet<Integer> set = new TreeSet<Integer>();
-    for (int i = 0; i < word.length; i++)
+    for (int i = 0; i < word.length; i++) {
-      set.add(word[i]);
+      int v = word[i];
      if (v > alphaMax) {
        throw new IllegalArgumentException("alphaMax exceeded by symbol " + v + " in word");
      }
      set.add(v);
    }
    alphabet = new int[set.size()];
    Iterator<Integer> iterator = set.iterator();
    for (int i = 0; i < alphabet.length; i++)
@ -81,9 +91,9 @@ public class LevenshteinAutomata {
      lower = higher + 1;
    }
    /* add the final endpoint */
-    if (lower <= Character.MAX_CODE_POINT) {
+    if (lower <= alphaMax) {
      rangeLower[numRanges] = lower;
-      rangeUpper[numRanges] = Character.MAX_CODE_POINT;
+      rangeUpper[numRanges] = alphaMax;
      numRanges++;
    }
@ -94,6 +104,15 @@ public class LevenshteinAutomata {
    };
  }
  /**
   * Decodes a UTF-16 string into an array holding one entry per Unicode code
   * point (a surrogate pair becomes a single entry).
   */
  private static int[] codePoints(String input) {
    final int charLength = input.length();
    final int[] result = new int[Character.codePointCount(input, 0, charLength)];
    int out = 0;
    int idx = 0;
    while (idx < charLength) {
      final int codePoint = input.codePointAt(idx);
      result[out++] = codePoint;
      // advance by 1 or 2 chars depending on whether this was a surrogate pair
      idx += Character.charCount(codePoint);
    }
    return result;
  }
  /**
   * Compute a DFA that accepts all strings within an edit distance of <code>n</code>.
   * <p>
@ -106,8 +125,9 @@ public class LevenshteinAutomata {
   * </p>
   */
  public Automaton toAutomaton(int n) {
-    if (n == 0)
+    if (n == 0) {
-      return BasicAutomata.makeString(input);
+      return BasicAutomata.makeString(word, 0, word.length);
    }
    if (n >= descriptions.length)
      return null;
--- a/lucene/core/src/java/org/apache/lucene/util/fst/Util.java
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/Util.java
@ -22,6 +22,8 @@ import java.util.*;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.fst.FST.Arc;
 import org.apache.lucene.util.fst.FST.BytesReader;
 /** Static helper methods.
 *
@ -304,7 +306,10 @@ public final class Util {
          path.input.ints[path.input.length++] = path.arc.label;
          final int cmp = bottom.input.compareTo(path.input);
          path.input.length--;
          // We should never see dups:
          assert cmp != 0;
          if (cmp < 0) {
            // Doesn't compete
            return;
@ -846,4 +851,93 @@ public final class Util {
    w.close();
  }
  */
  /**
   * Reads the first arc greater than or equal to the given label into the provided
   * arc in place and returns it iff found, otherwise returns <code>null</code>.
   * <p>
   * When <code>label</code> is {@link FST#END_LABEL} no arcs are scanned: the
   * provided arc is filled in as a final (end) arc if <code>follow</code> is
   * final, otherwise <code>null</code> is returned.
   * 
   * @param label the label to ceil on
   * @param fst the fst to operate on
   * @param follow the arc to follow reading the label from
   * @param arc the arc to read into in place
   * @param in the fst's {@link BytesReader}
   * @return <code>arc</code>, positioned on the first arc leaving
   *         <code>follow</code>'s target whose label is &gt;= <code>label</code>,
   *         or <code>null</code> if there is no such arc
   * @throws IOException if reading from the FST fails
   */
  public static <T> Arc<T> readCeilArc(int label, FST<T> fst, Arc<T> follow,
      Arc<T> arc, BytesReader in) throws IOException {
    // TODO maybe this would be useful in the FST class - we could simplify some other code like FSTEnum?
    if (label == FST.END_LABEL) {
      // Caller asked for the end label: only a final "follow" arc can match.
      if (follow.isFinal()) {
        if (follow.target <= 0) {
          // No target node to continue into, so flag the arc as the last one.
          arc.flags = FST.BIT_LAST_ARC;
        } else {
          arc.flags = 0;
          // NOTE: nextArc is a node (not an address!) in this case:
          arc.nextArc = follow.target;
          arc.node = follow.target;
        }
        arc.output = follow.nextFinalOutput;
        arc.label = FST.END_LABEL;
        return arc;
      } else {
        return null;
      }
    }
    if (!FST.targetHasArcs(follow)) {
      // Nothing leaves the target node, so there is nothing to ceil on.
      return null;
    }
    fst.readFirstTargetArc(follow, arc, in);
    if (arc.bytesPerArc != 0 && arc.label != FST.END_LABEL) {
      // Arcs are fixed array -- use binary search to find
      // the target.
      int low = arc.arcIdx;
      int high = arc.numArcs - 1;
      int mid = 0;
      // System.out.println("do arc array low=" + low + " high=" + high +
      // " targetLabel=" + targetLabel);
      while (low <= high) {
        mid = (low + high) >>> 1;
        // Seek to the mid-th fixed-width arc entry and read its label;
        // NOTE(review): the extra +1 presumably skips the arc's flags byte -- confirm against FST.
        in.pos = arc.posArcsStart;
        in.skip(arc.bytesPerArc * mid + 1);
        final int midLabel = fst.readLabel(in);
        final int cmp = midLabel - label;
        // System.out.println("  cycle low=" + low + " high=" + high + " mid=" +
        // mid + " midLabel=" + midLabel + " cmp=" + cmp);
        if (cmp < 0) {
          low = mid + 1;
        } else if (cmp > 0) {
          high = mid - 1;
        } else {
          // Exact match: step back one slot because the arc is loaded via
          // readNextRealArc (presumably it advances arcIdx before reading -- confirm).
          arc.arcIdx = mid-1;
          return fst.readNextRealArc(arc, in);
        }
      }
      if (low == arc.numArcs) {
        // DEAD END!
        // (low moved past the last arc, i.e. every probed label was smaller than the requested one)
        return null;
      }
      // The loop only exits with low > high, so this selects high, the slot
      // just before the first arc whose label is greater than the requested label.
      arc.arcIdx = (low > high ? high : low);
      return fst.readNextRealArc(arc, in);
    }
    // Linear scan
    fst.readFirstRealTargetArc(follow.target, arc, in);
    while (true) {
      // System.out.println("  non-bs cycle");
      // TODO: we should fix this code to not have to create
      // object for the output of every arc we scan... only
      // for the matching arc, if found
      if (arc.label >= label) {
        // System.out.println("    found!");
        return arc;
      } else if (arc.isLast()) {
        // Ran out of arcs without reaching the requested label.
        return null;
      } else {
        fst.readNextRealArc(arc, in);
      }
    }
  }
 }
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
@ -31,6 +31,7 @@ import java.util.Set;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.TokenStreamToAutomaton;
 import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
 import org.apache.lucene.search.spell.TermFreqIterator;
 import org.apache.lucene.search.suggest.Lookup;
 import org.apache.lucene.search.suggest.fst.Sort;
@ -310,7 +311,7 @@ public class AnalyzingSuggester extends Lookup {
    }
  }
-  private TokenStreamToAutomaton getTokenStreamToAutomaton() {
+  TokenStreamToAutomaton getTokenStreamToAutomaton() {
    if (preserveSep) {
      return new EscapingTokenStreamToAutomaton();
    } else {
@ -332,6 +333,7 @@ public class AnalyzingSuggester extends Lookup {
    BytesRef scratch = new BytesRef();
    TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
    // analyzed sequence + 0(byte) + weight(int) + surface + analyzedLength(short) 
    boolean success = false;
    byte buffer[] = new byte[8];
@ -339,29 +341,8 @@ public class AnalyzingSuggester extends Lookup {
      ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
      BytesRef surfaceForm;
      while ((surfaceForm = iterator.next()) != null) {
-
+        Set<IntsRef> paths = toFiniteStrings(surfaceForm, ts2a);
-        // Analyze surface form:
+        
        TokenStream ts = indexAnalyzer.tokenStream("", new StringReader(surfaceForm.utf8ToString()));
        // Create corresponding automaton: labels are bytes
        // from each analyzed token, with byte 0 used as
        // separator between tokens:
        Automaton automaton = ts2a.toAutomaton(ts);
        ts.end();
        ts.close();
        replaceSep(automaton);
        assert SpecialOperations.isFinite(automaton);
        // Get all paths from the automaton (there can be
        // more than one path, eg if the analyzer created a
        // graph using SynFilter or WDF):
        // TODO: we could walk & add simultaneously, so we
        // don't have to alloc [possibly biggish]
        // intermediate HashSet in RAM:
        Set<IntsRef> paths = SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions);
        maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, paths.size());
        for (IntsRef path : paths) {
@ -510,27 +491,10 @@ public class AnalyzingSuggester extends Lookup {
    }
    //System.out.println("lookup key=" + key + " num=" + num);
-
+    final BytesRef utf8Key = new BytesRef(key);
    try {
-      // TODO: is there a Reader from a CharSequence?
+      Automaton lookupAutomaton = toLookupAutomaton(key);
      // Turn tokenstream into automaton:
      TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()));
      Automaton automaton = getTokenStreamToAutomaton().toAutomaton(ts);
      ts.end();
      ts.close();
      // TODO: we could use the end offset to "guess"
      // whether the final token was a partial token; this
      // would only be a heuristic ... but maybe an OK one.
      // This way we could eg differentiate "net" from "net ",
      // which we can't today...
      replaceSep(automaton);
      // TODO: we can optimize this somewhat by determinizing
      // while we convert
      BasicOperations.determinize(automaton);
      final CharsRef spare = new CharsRef();
@ -538,8 +502,7 @@ public class AnalyzingSuggester extends Lookup {
      // Intersect automaton w/ suggest wFST and get all
      // prefix starting nodes & their outputs:
-      final List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths;
+      //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);
      prefixPaths = FSTUtil.intersectPrefixPaths(automaton, fst);
      //System.out.println("  prefixPaths: " + prefixPaths.size());
@ -549,6 +512,8 @@ public class AnalyzingSuggester extends Lookup {
      final List<LookupResult> results = new ArrayList<LookupResult>();
      List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(lookupAutomaton, fst);
      if (exactFirst) {
        int count = 0;
@ -593,9 +558,9 @@ public class AnalyzingSuggester extends Lookup {
        // nodes we have and the
        // maxSurfaceFormsPerAnalyzedForm:
        for(MinResult<Pair<Long,BytesRef>> completion : completions) {
-          spare.grow(completion.output.output2.length);
+          if (utf8Key.bytesEquals(completion.output.output2)) {
-          UnicodeUtil.UTF8toUTF16(completion.output.output2, spare);
+            spare.grow(completion.output.output2.length);
-          if (CHARSEQUENCE_COMPARATOR.compare(spare, key) == 0) {
+            UnicodeUtil.UTF8toUTF16(completion.output.output2, spare);
            results.add(new LookupResult(spare.toString(), decodeWeight(completion.output.output1)));
            break;
          }
@ -630,9 +595,7 @@ public class AnalyzingSuggester extends Lookup {
            // In exactFirst mode, don't accept any paths
            // matching the surface form since that will
            // create duplicate results:
-            spare.grow(output.output2.length);
+            if (utf8Key.bytesEquals(output.output2)) {
            UnicodeUtil.UTF8toUTF16(output.output2, spare);
            if (CHARSEQUENCE_COMPARATOR.compare(spare, key) == 0) {
              // We found exact match, which means we should
              // have already found it in the first search:
              assert results.size() == 1;
@ -644,6 +607,8 @@ public class AnalyzingSuggester extends Lookup {
        }
      };
      prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
      for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
        searcher.addStartPaths(path.fstNode, path.output, true, path.input);
      }
@ -654,6 +619,10 @@ public class AnalyzingSuggester extends Lookup {
        spare.grow(completion.output.output2.length);
        UnicodeUtil.UTF8toUTF16(completion.output.output2, spare);
        LookupResult result = new LookupResult(spare.toString(), decodeWeight(completion.output.output1));
        // TODO: for fuzzy case would be nice to return
        // how many edits were required
        //System.out.println("    result=" + result);
        results.add(result);
@ -670,6 +639,63 @@ public class AnalyzingSuggester extends Lookup {
    }
  }
  /**
   * Returns all prefix paths to initialize the search.
   * <p>
   * This implementation returns <code>prefixPaths</code> unchanged and does not
   * use the other two arguments; being <code>protected</code>, it can be
   * overridden by subclasses to supply a different set of start paths.
   *
   * @param prefixPaths the prefix paths supplied by the caller
   * @param lookupAutomaton the automaton built for the lookup key (unused here)
   * @param fst the suggester's FST (unused here)
   * @return the paths used to seed the completion search
   * @throws IOException declared for overriding implementations; this
   *         implementation performs no I/O
   */
  protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
                                                                       Automaton lookupAutomaton,
                                                                       FST<Pair<Long,BytesRef>> fst)
    throws IOException {
    return prefixPaths;
  }
  /**
   * Analyzes a surface form with the index analyzer and returns every finite
   * path through the resulting token automaton.
   *
   * @param surfaceForm UTF-8 bytes of the suggestion text to analyze
   * @param ts2a converter used to turn the token stream into an automaton
   * @return the set of analyzed paths; expansion is bounded by
   *         <code>maxGraphExpansions</code>, which is passed through to
   *         {@link SpecialOperations#getFiniteStrings}
   * @throws IOException if analysis fails; the token stream is closed in
   *         that case as well
   */
  final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
    // Analyze surface form:
    final TokenStream ts = indexAnalyzer.tokenStream("", new StringReader(surfaceForm.utf8ToString()));

    final Automaton automaton;
    try {
      // Create corresponding automaton: labels are bytes
      // from each analyzed token, with byte 0 used as
      // separator between tokens:
      automaton = ts2a.toAutomaton(ts);
      ts.end();
    } finally {
      // Always release the stream, even when analysis throws, so that a
      // failed analysis does not leave it open.
      ts.close();
    }

    replaceSep(automaton);

    assert SpecialOperations.isFinite(automaton);

    // Get all paths from the automaton (there can be
    // more than one path, eg if the analyzer created a
    // graph using SynFilter or WDF):

    // TODO: we could walk & add simultaneously, so we
    // don't have to alloc [possibly biggish]
    // intermediate HashSet in RAM:
    return SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions);
  }
  /**
   * Analyzes the lookup key with the query analyzer and returns the
   * determinized automaton of its analyzed form.
   *
   * @param key the text typed by the user
   * @return the determinized automaton built from the analyzed key, after
   *         separator replacement
   * @throws IOException if analysis fails; the token stream is closed in
   *         that case as well
   */
  final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
    // TODO: is there a Reader from a CharSequence?
    // Turn tokenstream into automaton:
    final TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()));

    final Automaton automaton;
    try {
      automaton = getTokenStreamToAutomaton().toAutomaton(ts);
      ts.end();
    } finally {
      // Always release the stream, even when analysis throws, so that a
      // failed lookup does not leave it open.
      ts.close();
    }

    // TODO: we could use the end offset to "guess"
    // whether the final token was a partial token; this
    // would only be a heuristic ... but maybe an OK one.
    // This way we could eg differentiate "net" from "net ",
    // which we can't today...

    replaceSep(automaton);

    // TODO: we can optimize this somewhat by determinizing
    // while we convert
    BasicOperations.determinize(automaton);
    return automaton;
  }
  /**
   * Returns the weight associated with an input string,
   * or null if it does not exist.
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java
@ -26,6 +26,7 @@ import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.State;
 import org.apache.lucene.util.automaton.Transition;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.Util;
 // TODO: move to core?  nobody else uses it yet though...
@ -62,57 +63,78 @@ public class FSTUtil {
    }
  }
-  /** Enumerates all paths in the automaton that also
+  /**
-   *  intersect the FST, accumulating the FST end node and
+   * Enumerates all minimal prefix paths in the automaton that also intersect the FST,
-   *  output for each path. */
+   * accumulating the FST end node and output for each path.
-  public static<T> List<Path<T>> intersectPrefixPaths(Automaton a, FST<T> fst) throws IOException {
+   */
  public static <T> List<Path<T>> intersectPrefixPaths(Automaton a, FST<T> fst)
      throws IOException {
    assert a.isDeterministic();
    final List<Path<T>> queue = new ArrayList<Path<T>>();
    final List<Path<T>> endNodes = new ArrayList<Path<T>>();
-
+    queue.add(new Path<T>(a.getInitialState(), fst
-    queue.add(new Path<T>(a.getInitialState(),
+        .getFirstArc(new FST.Arc<T>()), fst.outputs.getNoOutput(),
-                          fst.getFirstArc(new FST.Arc<T>()),       
+        new IntsRef()));
-                          fst.outputs.getNoOutput(),
+    
                          new IntsRef()));
    final FST.Arc<T> scratchArc = new FST.Arc<T>();
    final FST.BytesReader fstReader = fst.getBytesReader(0);
-
+    
    //System.out.println("fst/a intersect");
    while (queue.size() != 0) {
-      final Path<T> path = queue.remove(queue.size()-1);
+      final Path<T> path = queue.remove(queue.size() - 1);
      //System.out.println("  cycle path=" + path);
      if (path.state.isAccept()) {
        endNodes.add(path);
        // we can stop here if we accept this path,
        // we accept all further paths too
        continue;
      }
-
+      
      IntsRef currentInput = path.input;
-      for(Transition t : path.state.getTransitions()) {
+      for (Transition t : path.state.getTransitions()) {
-        
+        final int min = t.getMin();
-        // TODO: we can fix this if necessary:
+        final int max = t.getMax();
-        if (t.getMin() != t.getMax()) {
+        if (min == max) {
-          throw new IllegalStateException("can only handle Transitions that match one character");
+          final FST.Arc<T> nextArc = fst.findTargetArc(t.getMin(),
-        }
+              path.fstNode, scratchArc, fstReader);
-
+          if (nextArc != null) {
-        //System.out.println("    t=" + (char) t.getMin());
+            final IntsRef newInput = new IntsRef(currentInput.length + 1);
-
+            newInput.copyInts(currentInput);
-        final FST.Arc<T> nextArc = fst.findTargetArc(t.getMin(), path.fstNode, scratchArc, fstReader);
+            newInput.ints[currentInput.length] = t.getMin();
-        if (nextArc != null) {
+            newInput.length = currentInput.length + 1;
-          //System.out.println("      fst matches");
+            queue.add(new Path<T>(t.getDest(), new FST.Arc<T>()
-          // Path continues:
+                .copyFrom(nextArc), fst.outputs
-          IntsRef newInput = new IntsRef(currentInput.length + 1);
+                .add(path.output, nextArc.output), newInput));
-          newInput.copyInts(currentInput);
+          }
-          newInput.ints[currentInput.length] = t.getMin();
+        } else {
-          newInput.length = currentInput.length + 1;
+          // TODO: if this transition's TO state is accepting, and
-
+          // it accepts the entire range possible in the FST (ie. 0 to 255),
-          queue.add(new Path<T>(t.getDest(),
+          // we can simply use the prefix as the accepted state instead of
-                                new FST.Arc<T>().copyFrom(nextArc),
+          // looking up all the ranges and terminate early
-                                fst.outputs.add(path.output, nextArc.output),
+          // here.  This just shifts the work from one queue
-                                newInput));
+          // (this one) to another (the completion search
          // done in AnalyzingSuggester).
          FST.Arc<T> nextArc = Util.readCeilArc(min, fst, path.fstNode,
              scratchArc, fstReader);
          while (nextArc != null && nextArc.label <= max) {
            assert nextArc.label <=  max;
            assert nextArc.label >= min : nextArc.label + " "
                + min;
            final IntsRef newInput = new IntsRef(currentInput.length + 1);
            newInput.copyInts(currentInput);
            newInput.ints[currentInput.length] = nextArc.label;
            newInput.length = currentInput.length + 1;
            queue.add(new Path<T>(t.getDest(), new FST.Arc<T>()
                .copyFrom(nextArc), fst.outputs
                .add(path.output, nextArc.output), newInput));
            final int label = nextArc.label; // used in assert
            nextArc = nextArc.isLast() ? null : fst.readNextRealArc(nextArc,
                fstReader);
            assert nextArc == null || label < nextArc.label : "last: " + label
                + " next: " + nextArc.label;
          }
        }
      }
    }
    return endNodes;
  }
 }
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java
@ -0,0 +1,226 @@
 package org.apache.lucene.search.suggest.analyzing;
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.OutputStreamWriter;
 import java.io.Writer;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Set;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; // javadocs
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.BasicAutomata;
 import org.apache.lucene.util.automaton.BasicOperations;
 import org.apache.lucene.util.automaton.LevenshteinAutomata;
 import org.apache.lucene.util.automaton.SpecialOperations;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.PairOutputs.Pair;
 /**
 * Implements a fuzzy {@link AnalyzingSuggester}. The similarity measurement is
 * based on the Damerau-Levenshtein (optimal string alignment) algorithm, though
 * you can explicitly choose classic Levenshtein by passing <code>false</code>
 * for the <code>transpositions</code> parameter.
 * <p>
 * At most, this query will match terms up to
 * {@value org.apache.lucene.util.automaton.LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}
 * edits. Higher distances are not supported.  Note that the
 * fuzzy distance is measured in "byte space" on the bytes
 * returned by the {@link TokenStream}'s {@link
 * TermToBytesRefAttribute}, usually UTF8.  By default
 * the analyzed bytes must be at least 3 ({@link
 * #DEFAULT_MIN_FUZZY_LENGTH}) bytes before any edits are
 * considered.  Furthermore, the first 1 ({@link
 * #DEFAULT_NON_FUZZY_PREFIX}) byte is not allowed to be
 * edited.  We allow up to 1 ({@link
 * #DEFAULT_MAX_EDITS}) edit.
 *
 * <p>
 * NOTE: This suggester does not boost suggestions that
 * required no edits over suggestions that did require
 * edits.  This is a known limitation.
 *
 * <p>
 * Note: complex query analyzers can have a significant impact on the lookup
 * performance. It's recommended to not use analyzers that drop or inject terms
 * like synonyms to keep the complexity of the prefix intersection low for good
 * lookup performance. At index time, complex analyzers can safely be used.
 * </p>
 */
 public final class FuzzySuggester extends AnalyzingSuggester {
  private final int maxEdits;
  private final boolean transpositions;
  private final int nonFuzzyPrefix;
  private final int minFuzzyLength;
  /**
   * The default minimum length of the key passed to {@link
   * #lookup} before any edits are allowed.
   */
  public static final int DEFAULT_MIN_FUZZY_LENGTH = 3;
  /**
   * The default prefix length where edits are not allowed.
   */
  public static final int DEFAULT_NON_FUZZY_PREFIX = 1;
  /**
   * The default maximum number of edits for fuzzy
   * suggestions.
   */
  public static final int DEFAULT_MAX_EDITS = 1;
  /**
   * Creates a {@link FuzzySuggester} instance initialized with default values.
   * 
   * @param analyzer the analyzer used for this suggester
   */
  public FuzzySuggester(Analyzer analyzer) {
    this(analyzer, analyzer);
  }
  /**
   * Creates a {@link FuzzySuggester} instance with an index and a query analyzer initialized with default values.
   * 
   * @param indexAnalyzer
   *           Analyzer that will be used for analyzing suggestions while building the index.
   * @param queryAnalyzer
   *           Analyzer that will be used for analyzing query text during lookup
   */
  public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
    this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, DEFAULT_MAX_EDITS, true,
         DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH);
  }
  /**
   * Creates a {@link FuzzySuggester} instance.
   * 
   * @param indexAnalyzer Analyzer that will be used for
   *        analyzing suggestions while building the index.
   * @param queryAnalyzer Analyzer that will be used for
   *        analyzing query text during lookup
   * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}
   * @param maxSurfaceFormsPerAnalyzedForm Maximum number of
   *        surface forms to keep for a single analyzed form.
   *        When there are too many surface forms we discard the
   *        lowest weighted ones.
   * @param maxGraphExpansions Maximum number of graph paths
   *        to expand from the analyzed form.  Set this to -1 for
   *        no limit.
   * @param maxEdits must be between 0 and {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE} (inclusive).
   * @param transpositions <code>true</code> if transpositions should be treated as a primitive 
   *        edit operation. If this is false, comparisons will implement the classic
   *        Levenshtein algorithm.
   * @param nonFuzzyPrefix length of common (non-fuzzy) prefix, must not be negative
   *        (see default {@link #DEFAULT_NON_FUZZY_PREFIX})
   * @param minFuzzyLength minimum length of lookup key before any edits are allowed, must not
   *        be negative (see default {@link #DEFAULT_MIN_FUZZY_LENGTH})
   * @throws IllegalArgumentException if <code>maxEdits</code> is out of range, or if
   *         <code>nonFuzzyPrefix</code> or <code>minFuzzyLength</code> is negative
   */
  public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer,
                        int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
                        int maxEdits, boolean transpositions, int nonFuzzyPrefix,
                        int minFuzzyLength) {
    super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions);
    if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
      throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
    }
    // The messages state the actual requirement (value must be >= 0); the
    // previous wording ("must not be >= 0") said the opposite of the check.
    if (nonFuzzyPrefix < 0) {
      throw new IllegalArgumentException("nonFuzzyPrefix must be >= 0 (got " + nonFuzzyPrefix + ")");
    }
    if (minFuzzyLength < 0) {
      throw new IllegalArgumentException("minFuzzyLength must be >= 0 (got " + minFuzzyLength + ")");
    }
    this.maxEdits = maxEdits;
    this.transpositions = transpositions;
    this.nonFuzzyPrefix = nonFuzzyPrefix;
    this.minFuzzyLength = minFuzzyLength;
  }
  /**
   * Replaces the exact prefix paths with the paths obtained by intersecting
   * the FST with the Levenshtein expansion of the lookup automaton. The
   * incoming <code>prefixPaths</code> are not used.
   */
  @Override
  protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
                                                                       Automaton lookupAutomaton,
                                                                       FST<Pair<Long,BytesRef>> fst)
    throws IOException {
    // TODO: right now there's no penalty for fuzzy/edits,
    // ie a completion whose prefix matched exactly what the
    // user typed gets no boost over completions that
    // required an edit, which get no boost over completions
    // requiring two edits.  I suspect a multiplicative
    // factor is appropriate (eg, say a fuzzy match must be at
    // least 2X better weight than the non-fuzzy match to
    // "compete") ... in which case I think the wFST needs
    // to be log weights or something ...
    Automaton levA = toLevenshteinAutomata(lookupAutomaton);
    /*
      Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
      w.write(levA.toDot());
      w.close();
      System.out.println("Wrote LevA to out.dot");
    */
    return FSTUtil.intersectPrefixPaths(levA, fst);
  }
  /**
   * Builds the union of one automaton per finite path of
   * <code>automaton</code>. Paths that are no longer than the non-fuzzy
   * prefix, or shorter than the minimum fuzzy length, are matched exactly;
   * for all other paths the first <code>nonFuzzyPrefix</code> symbols are
   * matched exactly and the remainder is matched through a Levenshtein
   * automaton allowing up to <code>maxEdits</code> edits.
   */
  Automaton toLevenshteinAutomata(Automaton automaton) {
    final Set<IntsRef> ref = SpecialOperations.getFiniteStrings(automaton, -1);
    Automaton subs[] = new Automaton[ref.size()];
    int upto = 0;
    for (IntsRef path : ref) {
      if (path.length <= nonFuzzyPrefix || path.length < minFuzzyLength) {
        // Too short to be edited: accept this path verbatim.
        subs[upto] = BasicAutomata.makeString(path.ints, path.offset, path.length);
        upto++;
      } else {
        Automaton prefix = BasicAutomata.makeString(path.ints, path.offset, nonFuzzyPrefix);
        // Copy the editable tail (everything after the non-fuzzy prefix).
        int ints[] = new int[path.length-nonFuzzyPrefix];
        System.arraycopy(path.ints, path.offset+nonFuzzyPrefix, ints, 0, ints.length);
        // TODO: maybe add alphaMin to LevenshteinAutomata,
        // and pass 1 instead of 0?  We probably don't want
        // to allow the trailing dedup bytes to be
        // edited... but then 0 byte is "in general" allowed
        // on input (but not in UTF8).
        // alphaMax is 255 because the labels are bytes, not code points.
        LevenshteinAutomata lev = new LevenshteinAutomata(ints, 255, transpositions);
        Automaton levAutomaton = lev.toAutomaton(maxEdits);
        Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton));
        combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already
        subs[upto] = combined;
        upto++;
      }
    }
    if (subs.length == 0) {
      return BasicAutomata.makeEmpty(); // matches nothing
    } else if (subs.length == 1) {
      return subs[0];
    } else {
      Automaton a = BasicOperations.union(Arrays.asList(subs));
      // TODO: we could call toLevenshteinAutomata() before det? 
      // this only happens if you have multiple paths anyway (e.g. synonyms)
      BasicOperations.determinize(a);
      // Does not seem to help (and hurt maybe a bit: 6-9
      // prefix went from 19 to 18 kQPS):
      // a.reduce();
      return a;
    }
  }
 }
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java
@ -36,6 +36,7 @@ import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.search.suggest.Lookup; // javadocs
 import org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester;
 import org.apache.lucene.search.suggest.analyzing.FuzzySuggester;
 import org.apache.lucene.search.suggest.fst.FSTCompletionLookup;
 import org.apache.lucene.search.suggest.fst.WFSTCompletionLookup;
 import org.apache.lucene.search.suggest.jaspell.JaspellLookup;
@ -51,17 +52,20 @@ import org.junit.Ignore;
 public class LookupBenchmarkTest extends LuceneTestCase {
  @SuppressWarnings("unchecked")
  private final List<Class<? extends Lookup>> benchmarkClasses = Arrays.asList(
      FuzzySuggester.class,
      AnalyzingSuggester.class,
      JaspellLookup.class, 
      TSTLookup.class,
      FSTCompletionLookup.class,
-      WFSTCompletionLookup.class,
+      WFSTCompletionLookup.class
-      AnalyzingSuggester.class);
+      
      );
  private final static int rounds = 15;
  private final static int warmup = 5;
  private final int num = 7;
-  private final boolean onlyMorePopular = true;
+  private final boolean onlyMorePopular = false;
  private final static Random random = new Random(0xdeadbeef);
@ -212,8 +216,9 @@ public class LookupBenchmarkTest extends LuceneTestCase {
      final List<String> input = new ArrayList<String>(benchmarkInput.size());
      for (TermFreq tf : benchmarkInput) {
        String s = tf.term.utf8ToString();
-        input.add(s.substring(0, Math.min(s.length(), 
+        String sub = s.substring(0, Math.min(s.length(), 
-              minPrefixLen + random.nextInt(maxPrefixLen - minPrefixLen + 1))));
+            minPrefixLen + random.nextInt(maxPrefixLen - minPrefixLen + 1)));
        input.add(sub);
      }
      BenchmarkResult result = measure(new Callable<Integer>() {
@ -250,7 +255,9 @@ public class LookupBenchmarkTest extends LuceneTestCase {
      }
      return new BenchmarkResult(times, warmup, rounds);
    } catch (Exception e) {
      e.printStackTrace();
      throw new RuntimeException(e);
    }
  }
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java