SOLR-2378: A new, automaton-based implementation of the suggest (autocomplete)
component, offering an order of magnitude smaller memory consumption
compared to ternary trees and jaspell, and very fast lookups at runtime.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1092136 13f79535-47bb-0310-9956-ffa450edef68
Dawid Weiss 2011-04-14 11:16:43 +00:00
parent 2133423e2b
commit 191706df70
17 changed files with 51252 additions and 191 deletions

View File

@ -60,6 +60,11 @@ Detailed Change List
New Features
----------------------
* SOLR-2378: A new, automaton-based implementation of the suggest (autocomplete)
component, offering an order of magnitude smaller memory consumption
compared to ternary trees and jaspell, and very fast lookups at runtime.
(Dawid Weiss)
* SOLR-571: The autowarmCount for LRUCaches (LRUCache and FastLRUCache) now
supports "percentages" which get evaluated relative to the current size of

View File

@ -162,7 +162,7 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
} else {
throw new SolrException(SolrException.ErrorCode.NOT_FOUND,
"Specified dictionary does not exist.");
"Specified dictionary does not exist: " + getDictionaryName(params));
}
}
}

View File

@ -12,7 +12,6 @@ import org.apache.solr.core.SolrCore;
import org.apache.solr.util.TermFreqIterator;
public abstract class Lookup {
/**
* Result of a lookup.
*/

View File

@ -0,0 +1,556 @@
package org.apache.solr.spelling.suggest.fst;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.fst.Builder;
import org.apache.lucene.util.automaton.fst.FST;
import org.apache.lucene.util.automaton.fst.FST.Arc;
import org.apache.lucene.util.automaton.fst.NoOutputs;
import org.apache.lucene.util.automaton.fst.Outputs;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.spelling.suggest.Lookup;
import org.apache.solr.spelling.suggest.tst.TSTLookup;
import org.apache.solr.util.TermFreqIterator;
import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
/**
* Finite state automaton based implementation of the {@link Lookup} query
* suggestion/autocomplete interface.
*
* <h2>Implementation details</h2>
*
* <p>The construction step in {@link #build(TermFreqIterator)} works as follows:
* <ul>
* <li>A set of input terms (String) and weights (float) is given.</li>
* <li>The range of weights is determined and then all weights are discretized into a fixed set
* of values ({@link #buckets}).
* Note that this means that minor changes in weights may be lost during automaton construction.
* In general, this is not a big problem because the "priorities" of completions can be split
* into a fixed set of classes (even as rough as: very frequent, frequent, baseline, marginal).
* If you need exact, fine-grained weights, use {@link TSTLookup} instead.</li>
* <li>All terms in the input are prepended with a synthetic pseudo-character encoding the weight
* of that term. For example, a term <code>abc</code> with a discretized weight equal to '1' would
* become <code>1abc</code>.</li>
* <li>The terms are sorted by their raw utf16 character values (including the synthetic
* weight character in front).</li>
* <li>A finite state automaton ({@link FST}) is constructed from the input. The root node has
* arcs labeled with all possible weights. We cache all these arcs, highest-weight first.</li>
* </ul>
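*
* <p>A sketch of the construction, using hypothetical terms and weights and
* <code>buckets = 2</code> (so discretized weights fall into [0, 2]):
*
* <pre>
* input:            (car, 0.1)         (cat, 7.2)         (cab, 9.8)
* discretized:      (car, 0)           (cat, 1)           (cab, 2)
* weight-prefixed:  (char) 0 + "car"   (char) 1 + "cat"   (char) 2 + "cab"
* </pre>
*
* The prefixed terms, sorted by raw utf16 value (here they already are), are then
* fed to the FST builder.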
*
* <p>At runtime, in {@link #lookup(String, boolean, int)}, the automaton is utilized as follows:
* <ul>
* <li>For each possible term weight encoded in the automaton (cached arcs from the root above),
* starting with the highest one, we descend along the path of the input key. If the key is not
* a prefix of a sequence in the automaton (path ends prematurely), we exit immediately.
* No completions.</li>
* <li>Otherwise, we have found an internal automaton node that ends the key. <b>The entire
* subautomaton (all paths) starting from this node form the key's completions.</b> We start
* the traversal of this subautomaton. Every time we reach a final state (arc), we add a single
* suggestion to the list of results (the weight of this suggestion is constant and equal to the
* root path we started from). The tricky part is that because automaton edges are sorted and
* we scan depth-first, we can terminate the entire procedure as soon as we have collected
* the number of suggestions the user requested.</li>
* <li>In case the number of suggestions collected in the step above is still insufficient,
* we proceed to the next (smaller) weight leaving the root node and repeat the same
* algorithm again.
* </li>
* </ul>
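*
* <p>Continuing the sketch above, a {@link #lookup(String, boolean, int)} for the prefix
* <code>ca</code> with <code>num = 2</code> first follows the highest root arc (label 2)
* and collects <code>cab</code> (weight 2/2 = 1.0), then the next root arc (label 1) and
* collects <code>cat</code> (weight 1/2 = 0.5); the traversal stops as soon as the two
* requested suggestions have been gathered.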
*
* <h2>Runtime behavior and performance characteristics</h2>
*
* <p>The algorithm described above is optimized for finding suggestions to short prefixes
* in a top-weights-first order. This is probably the most common use case: it allows
* presenting suggestions early and sorts them by the global frequency (and then alphabetically).
*
* <p>If there is an exact match in the automaton, it is returned first on the results
* list (even with by-weight sorting).
*
* <p>Note that the maximum lookup time for <b>any prefix</b>
* is the time needed to descend to the node ending that prefix, plus the traversal of that
* node's subtree, bounded by the number of requested suggestions (because completions are
* already presorted by weight on the root level and alphabetically at any node level).
*
* <p>To order alphabetically only (no ordering by priorities), use identical term weights
* for all terms. Alphabetical suggestions are returned even if non-constant weights are
* used, but the algorithm for doing this is suboptimal.
*
* <p>"alphabetically" in any of the documentation above indicates utf16 codepoint order,
* nothing else.
*/
public class FSTLookup extends Lookup {
/** A structure for a single entry (for sorting/preprocessing). */
private static class Entry {
char [] term;
float weight;
public Entry(char [] term, float freq) {
this.term = term;
this.weight = freq;
}
}
/**
* The number of separate buckets for weights (discretization). The more buckets,
* the more fine-grained term weights (priorities) can be assigned. The speed of lookup
* will not decrease for prefixes which have highly-weighted completions (because these
* are filled in first), but it will decrease significantly for low-weighted terms (these
* should be infrequent, so it is usually acceptable).
*
* <p>The number of buckets must be within [1, 255] range.
*/
public static final String WEIGHT_BUCKETS = "weightBuckets";
/**
* If <code>true</code>, exact suggestions are returned first, even if they are prefixes
* of other strings in the automaton (possibly with larger weights).
*/
public static final String EXACT_MATCH_FIRST = "exactMatchFirst";
/** Serialized automaton file name (storage). */
public static final String FILENAME = "fst.dat";
/** An empty result (a shared instance; callers must not modify it). */
private static final List<LookupResult> EMPTY_RESULT = Lists.newArrayList();
/**
* @see #WEIGHT_BUCKETS
*/
private int buckets = 10;
/**
* @see #EXACT_MATCH_FIRST
*/
private boolean exactMatchFirst = true;
/**
* Finite state automaton encoding all the lookup terms. See class
* notes for details.
*/
private FST<Object> automaton;
/**
* An array of arcs leaving the root automaton state and encoding weights of all
* completions in their sub-trees.
*/
private Arc<Object> [] rootArcs;
/* */
@Override
@SuppressWarnings("rawtypes")
public void init(NamedList config, SolrCore core) {
this.buckets = config.get(WEIGHT_BUCKETS) != null
? Integer.parseInt(config.get(WEIGHT_BUCKETS).toString())
: 10;
this.exactMatchFirst = config.get(EXACT_MATCH_FIRST) != null
? Boolean.valueOf(config.get(EXACT_MATCH_FIRST).toString())
: true;
}
/* */
@Override
public void build(TermFreqIterator tfit) throws IOException {
// Buffer the input because we will need it twice: for calculating
// the weight distribution and for the actual automaton building.
List<Entry> entries = Lists.newArrayList();
while (tfit.hasNext()) {
String term = tfit.next();
char [] termChars = new char [term.length() + 1]; // add padding for weight.
for (int i = 0; i < term.length(); i++)
termChars[i + 1] = term.charAt(i);
entries.add(new Entry(termChars, tfit.freq()));
}
// Distribute weights into at most N buckets. This is a form of discretization to
// limit the number of possible weights so that they can be efficiently encoded in the
// automaton.
//
// It is assumed the distribution of weights is _linear_, so a proportional division
// of the [min, max] range is enough here. An alternative approach would be to sort
// the weights and divide them into equal-count ranges.
if (entries.size() > 0) {
redistributeWeightsProportionalMinMax(entries, buckets);
encodeWeightPrefix(entries);
}
// Build the automaton (includes input sorting) and cache root arcs in order from the
// highest to the lowest weight.
this.automaton = buildAutomaton(entries);
cacheRootArcs();
}
/**
* Cache the root node's output arcs starting with completions with the highest weights.
*/
@SuppressWarnings("unchecked")
private void cacheRootArcs() throws IOException {
if (automaton != null) {
List<Arc<Object>> rootArcs = Lists.newArrayList();
Arc<Object> arc = automaton.getFirstArc(new Arc<Object>());
automaton.readFirstTargetArc(arc, arc);
while (true) {
rootArcs.add(new Arc<Object>().copyFrom(arc));
if (arc.isLast())
break;
automaton.readNextArc(arc);
}
Collections.reverse(rootArcs); // we want highest weights first.
this.rootArcs = rootArcs.toArray(new Arc[rootArcs.size()]);
}
}
/**
* Not implemented.
*/
@Override
public boolean add(String key, Object value) {
// This implementation does not support ad-hoc additions (all input
// must be sorted for the builder).
return false;
}
/**
* Get the (approximated) weight of a single key (if there is a perfect match
* for it in the automaton).
*
* @return Returns the approximated weight of the input key or <code>null</code>
* if not found.
*/
@Override
public Float get(String key) {
return getExactMatchStartingFromRootArc(0, key);
}
/**
* Returns the first exact match by traversing root arcs, starting from
* the arc <code>i</code>.
*
* @param i The first root arc index in {@link #rootArcs} to consider when
* matching.
*/
private Float getExactMatchStartingFromRootArc(int i, String key) {
// Look for an exact match for the key under each root (weight) arc.
try {
final FST.Arc<Object> scratch = new FST.Arc<Object>();
for (; i < rootArcs.length; i++) {
final FST.Arc<Object> rootArc = rootArcs[i];
final FST.Arc<Object> arc = scratch.copyFrom(rootArc);
// Descend into the automaton using the key as prefix.
if (descendWithPrefix(arc, key)) {
automaton.readFirstTargetArc(arc, arc);
if (arc.label == FST.END_LABEL) {
// Prefix-encoded weight.
return rootArc.label / (float) buckets;
}
}
}
} catch (IOException e) {
// Should never happen, but anyway.
throw new RuntimeException(e);
}
return null;
}
/**
* Lookup autocomplete suggestions to <code>key</code>.
*
* @param key The prefix for which suggestions should be sought.
* @param onlyMorePopular Return most popular suggestions first. This is the default
* behavior for this implementation. Setting it to <code>false</code> has no effect (use
* constant term weights to sort alphabetically only).
* @param num At most this number of suggestions will be returned.
* @return Returns the suggestions, sorted by their approximated weight first (decreasing)
* and then alphabetically (utf16 codepoint order).
*/
@Override
public List<LookupResult> lookup(String key, boolean onlyMorePopular, int num) {
if (key.length() == 0 || automaton == null) {
// Keep the result an ArrayList to keep calls monomorphic.
return EMPTY_RESULT;
}
try {
if (!onlyMorePopular && rootArcs.length > 1) {
// We could emit a warning here (?). An optimal strategy for alphabetically sorted
// suggestions would be to add them with a constant weight -- this saves unnecessary
// traversals and sorting.
return lookupSortedAlphabetically(key, num);
} else {
return lookupSortedByWeight(key, num, true);
}
} catch (IOException e) {
// Should never happen, but anyway.
throw new RuntimeException(e);
}
}
/**
* Lookup suggestions sorted alphabetically <b>if weights are not constant</b>. This
* is a workaround: in general, use constant weights for alphabetically sorted results.
*/
private List<LookupResult> lookupSortedAlphabetically(String key, int num) throws IOException {
// Greedily get num results from each weight branch.
List<LookupResult> res = lookupSortedByWeight(key, num, false);
// Sort and trim.
Collections.sort(res, new Comparator<LookupResult>() {
@Override
public int compare(LookupResult o1, LookupResult o2) {
return o1.key.compareTo(o2.key);
}
});
if (res.size() > num) {
res = res.subList(0, num);
}
return res;
}
/**
* Lookup suggestions sorted by weight (descending order).
*
* @param greedy If <code>true</code>, the routine terminates immediately when <code>num</code>
* suggestions have been collected. If <code>false</code>, it will collect suggestions from
* all weight arcs (needed for {@link #lookupSortedAlphabetically}).
*/
private ArrayList<LookupResult> lookupSortedByWeight(String key, int num, boolean greedy) throws IOException {
final ArrayList<LookupResult> res = new ArrayList<LookupResult>(Math.min(10, num));
final StringBuilder output = new StringBuilder(key);
final int matchLength = key.length() - 1;
for (int i = 0; i < rootArcs.length; i++) {
final FST.Arc<Object> rootArc = rootArcs[i];
final FST.Arc<Object> arc = new FST.Arc<Object>().copyFrom(rootArc);
// Descend into the automaton using the key as prefix.
if (descendWithPrefix(arc, key)) {
// Prefix-encoded weight.
final float weight = rootArc.label / (float) buckets;
// A subgraph starting from the current node has the completions
// of the key prefix. The arc we are at ends with the key's last
// character, so we will collect it too.
output.setLength(matchLength);
if (collect(res, num, weight, output, arc) && greedy) {
// We have enough suggestions to return immediately. Keep looking for an
// exact match, if requested.
if (exactMatchFirst) {
Float exactMatchWeight = getExactMatchStartingFromRootArc(i, key);
if (exactMatchWeight != null) {
res.add(0, new LookupResult(key, exactMatchWeight));
while (res.size() > num) {
res.remove(res.size() - 1);
}
}
}
break;
}
}
}
return res;
}
/**
* Descend along the path starting at <code>arc</code> and going through
* the characters of the <code>term</code> argument.
*
* @param arc The starting arc. This argument is modified in-place.
* @param term The term to descend with.
* @return If <code>true</code>, <code>arc</code> will be set to the arc matching
* the last character of <code>term</code>. <code>false</code> is returned if no
* sequence with such a prefix exists in the automaton.
*/
private boolean descendWithPrefix(Arc<Object> arc, String term) throws IOException {
final int max = term.length();
for (int i = 0; i < max; i++) {
if (automaton.findTargetArc(term.charAt(i) & 0xffff, arc, arc) == null) {
// No matching prefixes, return an empty result.
return false;
}
}
return true;
}
/**
* Recursively collects lookup results from the automaton subgraph starting at <code>arc</code>.
*
* @param num Maximum number of results needed (early termination).
* @param weight Weight of all results found during this collection.
*/
private boolean collect(List<LookupResult> res, int num, float weight, StringBuilder output, Arc<Object> arc) throws IOException {
output.append((char) arc.label);
automaton.readFirstTargetArc(arc, arc);
while (true) {
if (arc.label == FST.END_LABEL) {
res.add(new LookupResult(output.toString(), weight));
if (res.size() >= num)
return true;
} else {
int save = output.length();
if (collect(res, num, weight, output, new Arc<Object>().copyFrom(arc))) {
return true;
}
output.setLength(save);
}
if (arc.isLast()) {
break;
}
automaton.readNextArc(arc);
}
return false;
}
/**
* Builds the final automaton from a list of entries.
*/
private FST<Object> buildAutomaton(List<Entry> entries) throws IOException {
if (entries.size() == 0)
return null;
// Sort by utf16 (raw char value)
final Comparator<Entry> comp = new Comparator<Entry>() {
public int compare(Entry o1, Entry o2) {
char [] ch1 = o1.term;
char [] ch2 = o2.term;
int len1 = ch1.length;
int len2 = ch2.length;
int max = Math.min(len1, len2);
for (int i = 0; i < max; i++) {
int v = ch1[i] - ch2[i];
if (v != 0) return v;
}
return len1 - len2;
}
};
Collections.sort(entries, comp);
// Remove duplicated entries. This is required because duplicate
// input breaks automaton construction.
int len = entries.size();
int j = 0;
for (int i = 1; i < len; i++) {
if (comp.compare(entries.get(j), entries.get(i)) != 0) {
entries.set(++j, entries.get(i));
}
}
entries = entries.subList(0, j + 1);
// Build the automaton.
final Outputs<Object> outputs = NoOutputs.getSingleton();
final Object empty = outputs.getNoOutput();
final Builder<Object> builder =
new Builder<Object>(FST.INPUT_TYPE.BYTE4, 0, 0, true, outputs);
final IntsRef scratchIntsRef = new IntsRef(10);
for (Entry e : entries) {
final int termLength = scratchIntsRef.length = e.term.length;
scratchIntsRef.grow(termLength);
final int [] ints = scratchIntsRef.ints;
final char [] chars = e.term;
for (int i = termLength; --i >= 0;) {
ints[i] = chars[i];
}
builder.add(scratchIntsRef, empty);
}
return builder.finish();
}
/**
* Prepends each entry's weight to its term, encoded as a single character, so that the
* root automaton node fans out to all possible priorities, starting with the arc that
* has the highest weight.
*/
private void encodeWeightPrefix(List<Entry> entries) {
for (Entry e : entries) {
int weight = (int) e.weight;
assert (weight >= 0 && weight <= buckets) :
"Weight out of range: " + weight + " [" + buckets + "]";
// There should be a single empty char reserved in front for the weight.
e.term[0] = (char) weight;
}
}
/**
* Split the [min, max] weight range into buckets, reassigning weights. Entries' weights
* are remapped to the [0, buckets] range (so buckets + 1 distinct values, actually).
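*
* <p>For example (a sketch), with <code>min = 1</code>, <code>max = 5</code> and
* <code>buckets = 10</code>, a weight of 3 maps to (int) (10 * ((3 - 1) / (5 - 1))) = 5.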
*/
private void redistributeWeightsProportionalMinMax(List<Entry> entries, int buckets) {
float min = entries.get(0).weight;
float max = min;
for (Entry e : entries) {
min = Math.min(e.weight, min);
max = Math.max(e.weight, max);
}
final float range = max - min;
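// Note: if all weights are equal, range == 0 and the division below yields NaN;
// (int) NaN is 0 in Java, so every entry then falls into bucket 0.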
for (Entry e : entries) {
e.weight = (int) (buckets * ((e.weight - min) / range)); // int cast equiv. to floor()
}
}
/**
* Deserialization from disk.
*/
@Override
public synchronized boolean load(File storeDir) throws IOException {
File data = new File(storeDir, FILENAME);
if (!data.exists() || !data.canRead()) {
return false;
}
InputStream is = new BufferedInputStream(new FileInputStream(data));
try {
this.automaton = new FST<Object>(new InputStreamDataInput(is), NoOutputs.getSingleton());
cacheRootArcs();
} finally {
Closeables.closeQuietly(is);
}
return true;
}
/**
* Serialization to disk.
*/
@Override
public synchronized boolean store(File storeDir) throws IOException {
if (!storeDir.exists() || !storeDir.isDirectory() || !storeDir.canWrite()) {
return false;
}
if (this.automaton == null)
return false;
File data = new File(storeDir, FILENAME);
OutputStream os = new BufferedOutputStream(new FileOutputStream(data));
try {
this.automaton.save(new OutputStreamDataOutput(os));
} finally {
Closeables.closeQuietly(os);
}
return true;
}
}
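
A minimal usage sketch of the class above (assuming the TermFreq and
TermFreqArrayIterator test helpers added later in this commit; LookupResult's
toString() renders as key/weight):

FSTLookup lookup = new FSTLookup();
lookup.build(new TermFreqArrayIterator(new TermFreq[] {
    new TermFreq("acquire", 2f),
    new TermFreq("accommodate", 1f)
}));
// Prints the two completions of "ac", highest discretized weight first.
for (Lookup.LookupResult r : lookup.lookup("ac", true, 2)) {
  System.out.println(r);
}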

View File

@ -0,0 +1,31 @@
package org.apache.solr.spelling.suggest.fst;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import org.apache.lucene.store.DataInput;
import com.google.common.io.ByteStreams;
/**
* A {@link DataInput} wrapping a plain {@link InputStream}.
*/
public class InputStreamDataInput extends DataInput {
private final InputStream is;
public InputStreamDataInput(InputStream is) {
this.is = is;
}
@Override
public byte readByte() throws IOException {
int v = is.read();
if (v == -1) throw new EOFException();
return (byte) v;
}
@Override
public void readBytes(byte[] b, int offset, int len) throws IOException {
ByteStreams.readFully(is, b, offset, len);
}
}

View File

@ -0,0 +1,28 @@
package org.apache.solr.spelling.suggest.fst;
import java.io.IOException;
import java.io.OutputStream;
import org.apache.lucene.store.DataOutput;
/**
* A {@link DataOutput} wrapping a plain {@link OutputStream}.
*/
public class OutputStreamDataOutput extends DataOutput {
private final OutputStream os;
public OutputStreamDataOutput(OutputStream os) {
this.os = os;
}
@Override
public void writeByte(byte b) throws IOException {
os.write(b);
}
@Override
public void writeBytes(byte[] b, int offset, int length) throws IOException {
os.write(b, offset, length);
}
}

File diff suppressed because it is too large.

View File

@ -31,7 +31,7 @@
<requestHandler name="standard" class="solr.StandardRequestHandler" />
<!-- Suggest component -->
<searchComponent class="solr.SpellCheckComponent" name="suggest">
<searchComponent class="solr.SpellCheckComponent" name="suggest_jaspell">
<lst name="spellchecker">
<str name="name">suggest</str>
<str name="classname">org.apache.solr.spelling.suggest.Suggester</str>
@ -45,6 +45,38 @@
</lst>
</searchComponent>
<!-- TSTLookup suggest component -->
<searchComponent class="solr.SpellCheckComponent" name="suggest_tst">
<lst name="spellchecker">
<str name="name">suggest_tst</str>
<str name="classname">org.apache.solr.spelling.suggest.Suggester</str>
<str name="lookupImpl">org.apache.solr.spelling.suggest.tst.TSTLookup</str>
<str name="field">suggest</str>
<str name="storeDir">suggest_tst</str>
<str name="buildOnCommit">true</str>
<!-- Suggester properties -->
<float name="threshold">0.0</float>
</lst>
</searchComponent>
<!-- FSTLookup suggest component -->
<searchComponent class="solr.SpellCheckComponent" name="suggest_fst">
<lst name="spellchecker">
<str name="name">suggest_fst</str>
<str name="classname">org.apache.solr.spelling.suggest.Suggester</str>
<str name="lookupImpl">org.apache.solr.spelling.suggest.fst.FSTLookup</str>
<str name="field">suggest</str>
<str name="storeDir">suggest_fst</str>
<str name="buildOnCommit">true</str>
<!-- Suggester properties -->
<int name="weightBuckets">5</int>
<bool name="exactMatchFirst">true</bool>
</lst>
</searchComponent>
<!-- The default (jaspell) -->
<requestHandler class="org.apache.solr.handler.component.SearchHandler" name="/suggest">
<lst name="defaults">
<str name="spellcheck">true</str>
@ -52,8 +84,32 @@
<str name="spellcheck.collate">true</str>
</lst>
<arr name="components">
<str>suggest</str>
<str>suggest_jaspell</str>
</arr>
</requestHandler>
<!-- tst (ternary tree based) -->
<requestHandler class="org.apache.solr.handler.component.SearchHandler" name="/suggest_tst">
<lst name="defaults">
<str name="spellcheck">true</str>
<str name="spellcheck.dictionary">suggest_tst</str>
<str name="spellcheck.collate">true</str>
</lst>
<arr name="components">
<str>suggest_tst</str>
</arr>
</requestHandler>
<!-- fst (finite state automaton based) -->
<requestHandler class="org.apache.solr.handler.component.SearchHandler" name="/suggest_fst">
<lst name="defaults">
<str name="spellcheck">true</str>
<str name="spellcheck.dictionary">suggest_fst</str>
<str name="spellcheck.collate">false</str>
</lst>
<arr name="components">
<str>suggest_fst</str>
</arr>
</requestHandler>
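<!-- A request against the handler above might look like (hypothetical core URL):
     http://localhost:8983/solr/suggest_fst?q=ac
     spellcheck.count and the dictionary name are picked up from the defaults. -->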
</config>

View File

@ -0,0 +1,52 @@
package org.apache.solr.spelling.suggest;
import java.util.List;
import java.util.Locale;
/**
* Average with standard deviation.
*/
final class Average
{
/**
* Average (in milliseconds).
*/
public final double avg;
/**
* Standard deviation (in milliseconds).
*/
public final double stddev;
/**
*
*/
Average(double avg, double stddev)
{
this.avg = avg;
this.stddev = stddev;
}
public String toString()
{
return String.format(Locale.ENGLISH, "%.0f [+- %.2f]",
avg, stddev);
}
static Average from(List<Double> values)
{
double sum = 0;
double sumSquares = 0;
for (double l : values)
{
sum += l;
sumSquares += l * l;
}
double avg = sum / (double) values.size();
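// Population standard deviation via the one-pass identity Var(X) = E[X^2] - (E[X])^2.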
return new Average(
avg,
Math.sqrt(sumSquares / (double) values.size() - avg * avg));
}
}

View File

@ -0,0 +1,230 @@
package org.apache.solr.spelling.suggest;
import java.net.URL;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Random;
import java.util.concurrent.Callable;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.solr.spelling.suggest.fst.FSTLookup;
import org.apache.solr.spelling.suggest.jaspell.JaspellLookup;
import org.apache.solr.spelling.suggest.tst.TSTLookup;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;
import com.google.common.base.Charsets;
import com.google.common.base.Function;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.io.Resources;
/**
* Benchmark tests for implementations of the {@link Lookup} interface.
*/
@Ignore // COMMENT ME TO RUN BENCHMARKS!
public class LookupBenchmarkTest {
@SuppressWarnings("unchecked")
private final List<Class<? extends Lookup>> benchmarkClasses = Lists.newArrayList(
JaspellLookup.class,
TSTLookup.class,
FSTLookup.class);
private final static int rounds = 15;
private final static int warmup = 5;
private final int num = 7;
private final boolean onlyMorePopular = true;
private final static Random random = new Random(0xdeadbeef);
/**
* Input term/weight pairs.
*/
private static TermFreq [] dictionaryInput;
/**
* Benchmark term/weight pairs (randomized order).
*/
private static List<TermFreq> benchmarkInput;
/**
* Loads terms and frequencies from Wikipedia (cached).
*/
@BeforeClass
public static void setup() throws Exception {
List<TermFreq> input = readTop50KWiki();
Collections.shuffle(input, random);
LookupBenchmarkTest.dictionaryInput = input.toArray(new TermFreq [input.size()]);
Collections.shuffle(input, random);
LookupBenchmarkTest.benchmarkInput = input;
}
/**
* Collect the multilingual input for benchmarks/tests.
*/
public static List<TermFreq> readTop50KWiki() throws Exception {
List<TermFreq> input = Lists.newArrayList();
URL resource = Thread.currentThread().getContextClassLoader().getResource("Top50KWiki.utf8");
assert resource != null : "Resource missing: Top50KWiki.utf8";
for (String line : Resources.readLines(resource, Charsets.UTF_8)) {
int tab = line.indexOf('|');
Assert.assertTrue("No | separator?: " + line, tab >= 0);
float weight = Float.parseFloat(line.substring(tab + 1));
String key = line.substring(0, tab);
input.add(new TermFreq(key, weight));
}
return input;
}
/**
* Test construction time.
*/
@Test
public void testConstructionTime() throws Exception {
System.err.println("-- construction time");
for (final Class<? extends Lookup> cls : benchmarkClasses) {
BenchmarkResult result = measure(new Callable<Integer>() {
public Integer call() throws Exception {
final Lookup lookup = buildLookup(cls, dictionaryInput);
return lookup.hashCode();
}
});
System.err.println(
String.format(Locale.ENGLISH, "%-15s input: %d, time[ms]: %s",
cls.getSimpleName(),
dictionaryInput.length,
result.average.toString()));
}
}
/**
* Test memory required for the storage.
*/
@Test
public void testStorageNeeds() throws Exception {
System.err.println("-- RAM consumption");
final RamUsageEstimator rue = new RamUsageEstimator();
for (Class<? extends Lookup> cls : benchmarkClasses) {
Lookup lookup = buildLookup(cls, dictionaryInput);
System.err.println(
String.format(Locale.ENGLISH, "%-15s size[B]:%,13d",
lookup.getClass().getSimpleName(),
rue.estimateRamUsage(lookup)));
}
}
/**
* Create {@link Lookup} instance and populate it.
*/
private Lookup buildLookup(Class<? extends Lookup> cls, TermFreq[] input) throws Exception {
Lookup lookup = cls.newInstance();
lookup.build(new TermFreqArrayIterator(input));
return lookup;
}
/**
* Test performance of lookup on full hits.
*/
@Test
public void testPerformanceOnFullHits() throws Exception {
final int minPrefixLen = 100;
final int maxPrefixLen = 200;
runPerformanceTest(minPrefixLen, maxPrefixLen, num, onlyMorePopular);
}
/**
* Test performance of lookup on longer term prefixes (6-9 letters, or the whole term if shorter).
*/
@Test
public void testPerformanceOnPrefixes6_9() throws Exception {
final int minPrefixLen = 6;
final int maxPrefixLen = 9;
runPerformanceTest(minPrefixLen, maxPrefixLen, num, onlyMorePopular);
}
/**
* Test performance of lookup on short term prefixes (2-4 letters, or the whole term if shorter).
*/
@Test
public void testPerformanceOnPrefixes2_4() throws Exception {
final int minPrefixLen = 2;
final int maxPrefixLen = 4;
runPerformanceTest(minPrefixLen, maxPrefixLen, num, onlyMorePopular);
}
/**
* Run the actual benchmark.
*/
public void runPerformanceTest(final int minPrefixLen, final int maxPrefixLen,
final int num, final boolean onlyMorePopular) throws Exception {
System.err.println(String.format(Locale.ENGLISH,
"-- prefixes: %d-%d, num: %d, onlyMorePopular: %s",
minPrefixLen, maxPrefixLen, num, onlyMorePopular));
for (Class<? extends Lookup> cls : benchmarkClasses) {
final Lookup lookup = buildLookup(cls, dictionaryInput);
final List<String> input = Lists.newArrayList(Iterables.transform(benchmarkInput, new Function<TermFreq, String>() {
public String apply(TermFreq tf) {
return tf.term.substring(0, Math.min(tf.term.length(),
minPrefixLen + random.nextInt(maxPrefixLen - minPrefixLen + 1)));
}
}));
BenchmarkResult result = measure(new Callable<Integer>() {
public Integer call() throws Exception {
int v = 0;
for (String term : input) {
v += lookup.lookup(term, onlyMorePopular, num).size();
}
return v;
}
});
System.err.println(
String.format(Locale.ENGLISH, "%-15s queries: %d, time[ms]: %s, ~qps: %.0f",
lookup.getClass().getSimpleName(),
input.size(),
result.average.toString(),
input.size() / result.average.avg));
}
}
/**
* Do the measurements.
*/
private BenchmarkResult measure(Callable<Integer> callable) {
final double NANOS_PER_MS = 1000000;
try {
List<Double> times = Lists.newArrayList();
for (int i = 0; i < warmup + rounds; i++) {
final long start = System.nanoTime();
guard = callable.call().intValue();
times.add((System.nanoTime() - start) / NANOS_PER_MS);
}
return new BenchmarkResult(times, warmup, rounds);
} catch (Exception e) {
throw new RuntimeException(e);
}
}
/** Guard against the JIT optimizing away the benchmark loop. */
@SuppressWarnings("unused")
private static volatile int guard;
private static class BenchmarkResult {
/** Average time per round (ms). */
public final Average average;
public BenchmarkResult(List<Double> times, int warmup, int rounds) {
this.average = Average.from(times.subList(warmup, times.size()));
}
}
}

View File

@ -19,62 +19,74 @@ package org.apache.solr.spelling.suggest;
import java.io.File;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.spelling.suggest.fst.FSTLookup;
import org.apache.solr.spelling.suggest.jaspell.JaspellLookup;
import org.apache.solr.spelling.suggest.tst.TSTLookup;
import org.junit.Test;
public class PersistenceTest extends SolrTestCaseJ4 {
public static final String[] keys = new String[] {
"one",
"two",
"three",
"four",
"oneness",
"onerous",
"onesimus",
"twofold",
"twonk",
"thrive",
"through",
"threat",
"foundation",
"fourier",
"fourty"
};
public final String[] keys = new String[] {
"one",
"two",
"three",
"four",
"oneness",
"onerous",
"onesimus",
"twofold",
"twonk",
"thrive",
"through",
"threat",
"foundation",
"fourier",
"fourty"};
@Test
public void testTSTPersistence() throws Exception {
TSTLookup lookup = new TSTLookup();
for (String k : keys) {
lookup.add(k, new Float(k.length()));
}
File storeDir = new File(TEST_HOME());
lookup.store(storeDir);
lookup = new TSTLookup();
lookup.load(storeDir);
for (String k : keys) {
Float val = (Float)lookup.get(k);
assertNotNull(k, val);
assertEquals(k, k.length(), val.intValue());
}
runTest(TSTLookup.class, true);
}
@Test
public void testJaspellPersistence() throws Exception {
JaspellLookup lookup = new JaspellLookup();
for (String k : keys) {
lookup.add(k, new Float(k.length()));
}
File storeDir = new File(TEST_HOME());
lookup.store(storeDir);
lookup = new JaspellLookup();
lookup.load(storeDir);
for (String k : keys) {
Float val = (Float)lookup.get(k);
assertNotNull(k, val);
assertEquals(k, k.length(), val.intValue());
}
runTest(JaspellLookup.class, true);
}
@Test
public void testFSTPersistence() throws Exception {
runTest(FSTLookup.class, false);
}
private void runTest(Class<? extends Lookup> lookupClass,
boolean supportsExactWeights) throws Exception {
// Add all input keys.
Lookup lookup = lookupClass.newInstance();
TermFreq[] keys = new TermFreq[this.keys.length];
for (int i = 0; i < keys.length; i++)
keys[i] = new TermFreq(this.keys[i], (float) i);
lookup.build(new TermFreqArrayIterator(keys));
// Store the suggester.
File storeDir = new File(TEST_HOME());
lookup.store(storeDir);
// Re-read it from disk.
lookup = lookupClass.newInstance();
lookup.load(storeDir);
// Assert validity.
float previous = Float.NEGATIVE_INFINITY;
for (TermFreq k : keys) {
Float val = (Float) lookup.get(k.term);
assertNotNull(k.term, val);
if (supportsExactWeights) {
assertEquals(k.term, Float.valueOf(k.v), val);
} else {
assertTrue(val + ">=" + previous, val >= previous);
previous = val.floatValue();
}
}
}
}

View File

@ -0,0 +1,7 @@
package org.apache.solr.spelling.suggest;
public class SuggesterFSTTest extends SuggesterTest {
public SuggesterFSTTest() {
super.requestUri = "/suggest_fst";
}
}

View File

@ -0,0 +1,7 @@
package org.apache.solr.spelling.suggest;
public class SuggesterTSTTest extends SuggesterTest {
public SuggesterTSTTest() {
super.requestUri = "/suggest_tst";
}
}

View File

@ -17,28 +17,19 @@
package org.apache.solr.spelling.suggest;
import org.apache.lucene.util.RamUsageEstimator;
import java.io.File;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.params.SpellingParams;
import org.apache.solr.spelling.suggest.Lookup.LookupResult;
import org.apache.solr.spelling.suggest.jaspell.JaspellLookup;
import org.apache.solr.spelling.suggest.tst.TSTLookup;
import org.apache.solr.util.TermFreqIterator;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;
import com.google.common.collect.Lists;
import java.io.File;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Random;
public class SuggesterTest extends SolrTestCaseJ4 {
/**
* Expected URI at which the given suggester will live.
*/
protected String requestUri = "/suggest";
@BeforeClass
public static void beforeClass() throws Exception {
initCore("solrconfig-spellchecker.xml","schema-spellchecker.xml");
@ -59,10 +50,9 @@ public class SuggesterTest extends SolrTestCaseJ4 {
@Test
public void testSuggestions() throws Exception {
addDocs();
assertU(commit()); // configured to do a rebuild on commit
assertQ(req("qt","/suggest", "q","ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
assertQ(req("qt", requestUri, "q", "ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/int[@name='numFound'][.='2']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/arr[@name='suggestion']/str[1][.='acquire']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/arr[@name='suggestion']/str[2][.='accommodate']"
@ -82,12 +72,12 @@ public class SuggesterTest extends SolrTestCaseJ4 {
dataDir = data;
configString = config;
initCore();
assertQ(req("qt","/suggest", "q","ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
assertQ(req("qt", requestUri, "q", "ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/int[@name='numFound'][.='2']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/arr[@name='suggestion']/str[1][.='acquire']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/arr[@name='suggestion']/str[2][.='accommodate']"
);
// restore the property
System.setProperty("solr.test.leavedatadir", leaveData);
}
@ -96,132 +86,13 @@ public class SuggesterTest extends SolrTestCaseJ4 {
public void testRebuild() throws Exception {
addDocs();
assertU(commit());
assertQ(req("qt","/suggest", "q","ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/int[@name='numFound'][.='2']");
assertQ(req("qt", requestUri, "q", "ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/int[@name='numFound'][.='2']");
assertU(adoc("id", "4",
"text", "actually"
));
assertU(commit());
assertQ(req("qt","/suggest", "q","ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/int[@name='numFound'][.='2']");
}
private TermFreqIterator getTFIT() {
final int count = 100000;
TermFreqIterator tfit = new TermFreqIterator() {
Random r = new Random(1234567890L);
Random r1 = new Random(1234567890L);
int pos;
public float freq() {
return r1.nextInt(4);
}
public boolean hasNext() {
return pos < count;
}
public String next() {
pos++;
return Long.toString(r.nextLong());
}
public void remove() {
throw new UnsupportedOperationException();
}
};
return tfit;
}
static class Bench {
long buildTime;
long lookupTime;
}
@Test @Ignore
public void testBenchmark() throws Exception {
final List<Class<? extends Lookup>> benchmarkClasses = Lists.newArrayList();
benchmarkClasses.add(JaspellLookup.class);
benchmarkClasses.add(TSTLookup.class);
// Run a single pass just to see if everything works fine and provide size estimates.
final RamUsageEstimator rue = new RamUsageEstimator();
for (Class<? extends Lookup> cls : benchmarkClasses) {
Lookup lookup = singleBenchmark(cls, null);
System.err.println(
String.format(Locale.ENGLISH,
"%20s, size[B]=%,d",
lookup.getClass().getSimpleName(),
rue.estimateRamUsage(lookup)));
}
int warmupCount = 10;
int measuredCount = 100;
for (Class<? extends Lookup> cls : benchmarkClasses) {
Bench b = fullBenchmark(cls, warmupCount, measuredCount);
System.err.println(String.format(Locale.ENGLISH,
"%s: buildTime[ms]=%,d lookupTime[ms]=%,d",
cls.getSimpleName(),
(b.buildTime / measuredCount),
(b.lookupTime / measuredCount / 1000000)));
}
}
private Lookup singleBenchmark(Class<? extends Lookup> cls, Bench bench) throws Exception {
Lookup lookup = cls.newInstance();
long start = System.currentTimeMillis();
lookup.build(getTFIT());
long buildTime = System.currentTimeMillis() - start;
TermFreqIterator tfit = getTFIT();
long elapsed = 0;
while (tfit.hasNext()) {
String key = tfit.next();
// take only the first part of the key
int len = key.length() > 4 ? key.length() / 3 : 2;
String prefix = key.substring(0, len);
start = System.nanoTime();
List<LookupResult> res = lookup.lookup(prefix, true, 10);
elapsed += System.nanoTime() - start;
assertTrue(res.size() > 0);
for (LookupResult lr : res) {
assertTrue(lr.key.startsWith(prefix));
}
}
if (bench != null) {
bench.buildTime += buildTime;
bench.lookupTime += elapsed;
}
return lookup;
}
private Bench fullBenchmark(Class<? extends Lookup> cls, int warmupCount, int measuredCount) throws Exception {
System.err.println("* Running " + measuredCount + " iterations for " + cls.getSimpleName() + " ...");
System.err.println(" - warm-up " + warmupCount + " iterations...");
for (int i = 0; i < warmupCount; i++) {
System.runFinalization();
System.gc();
singleBenchmark(cls, null);
}
Bench b = new Bench();
System.err.print(" - main iterations:"); System.err.flush();
for (int i = 0; i < measuredCount; i++) {
System.runFinalization();
System.gc();
singleBenchmark(cls, b);
if (i > 0 && (i % 10 == 0)) {
System.err.print(" " + i);
System.err.flush();
}
}
System.err.println();
return b;
assertQ(req("qt", requestUri, "q", "ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/int[@name='numFound'][.='2']");
}
}

View File

@ -0,0 +1,11 @@
package org.apache.solr.spelling.suggest;
public final class TermFreq {
public final String term;
public final float v;
public TermFreq(String term, float v) {
this.term = term;
this.v = v;
}
}

View File

@ -0,0 +1,40 @@
package org.apache.solr.spelling.suggest;
import java.util.Arrays;
import java.util.Iterator;
import org.apache.solr.util.TermFreqIterator;
/**
* A {@link TermFreqIterator} over a sequence of {@link TermFreq}s.
*/
public final class TermFreqArrayIterator implements TermFreqIterator {
private final Iterator<TermFreq> i;
private TermFreq current;
public TermFreqArrayIterator(Iterator<TermFreq> i) {
this.i = i;
}
public TermFreqArrayIterator(TermFreq [] i) {
this(Arrays.asList(i));
}
public TermFreqArrayIterator(Iterable<TermFreq> i) {
this(i.iterator());
}
public float freq() {
return current.v;
}
public boolean hasNext() {
return i.hasNext();
}
public String next() {
return (current = i.next()).term;
}
public void remove() { throw new UnsupportedOperationException(); }
}

View File

@ -0,0 +1,155 @@
package org.apache.solr.spelling.suggest.fst;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.Random;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.solr.spelling.suggest.Lookup.LookupResult;
import org.apache.solr.spelling.suggest.LookupBenchmarkTest;
import org.apache.solr.spelling.suggest.TermFreq;
import org.apache.solr.spelling.suggest.TermFreqArrayIterator;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import com.google.common.collect.Lists;
/**
* Unit tests for {@link FSTLookup}.
*/
public class FSTLookupTest extends LuceneTestCase {
public static TermFreq tf(String t, float v) {
return new TermFreq(t, v);
}
private FSTLookup lookup;
@Before
public void prepare() throws Exception {
final TermFreq[] keys = new TermFreq[] {
tf("one", 0.5f),
tf("oneness", 1),
tf("onerous", 1),
tf("onesimus", 1),
tf("two", 1),
tf("twofold", 1),
tf("twonk", 1),
tf("thrive", 1),
tf("through", 1),
tf("threat", 1),
tf("three", 1),
tf("foundation", 1),
tf("fourier", 1),
tf("four", 1),
tf("fourty", 1),
tf("xo", 1),
};
lookup = new FSTLookup();
lookup.build(new TermFreqArrayIterator(keys));
}
@Test
public void testExactMatchHighPriority() throws Exception {
assertMatchEquals(lookup.lookup("two", true, 1), "two/1.0");
}
@Test
public void testExactMatchLowPriority() throws Exception {
assertMatchEquals(lookup.lookup("one", true, 2),
"one/0.0",
"oneness/1.0");
}
@Test
public void testMiss() throws Exception {
assertMatchEquals(lookup.lookup("xyz", true, 1));
}
@Test
public void testAlphabeticWithWeights() throws Exception {
assertEquals(0, lookup.lookup("xyz", false, 1).size());
}
@Test
public void testFullMatchList() throws Exception {
assertMatchEquals(lookup.lookup("one", true, Integer.MAX_VALUE),
"oneness/1.0",
"onerous/1.0",
"onesimus/1.0",
"one/0.0");
}
@Test
public void testMultilingualInput() throws Exception {
List<TermFreq> input = LookupBenchmarkTest.readTop50KWiki();
lookup = new FSTLookup();
lookup.build(new TermFreqArrayIterator(input));
for (TermFreq tf : input) {
assertTrue("Not found: " + tf.term, lookup.get(tf.term) != null);
assertEquals(tf.term, lookup.lookup(tf.term, true, 1).get(0).key);
}
}
@Test
public void testEmptyInput() throws Exception {
lookup = new FSTLookup();
lookup.build(new TermFreqArrayIterator(new TermFreq[0]));
assertMatchEquals(lookup.lookup("", true, 10));
}
@Test
public void testRandom() throws Exception {
List<TermFreq> freqs = Lists.newArrayList();
Random rnd = random;
for (int i = 0; i < 5000; i++) {
freqs.add(new TermFreq("" + rnd.nextLong(), rnd.nextInt(100)));
}
lookup = new FSTLookup();
lookup.build(new TermFreqArrayIterator(freqs.toArray(new TermFreq[freqs.size()])));
for (TermFreq tf : freqs) {
final String term = tf.term;
for (int i = 1; i < term.length(); i++) {
String prefix = term.substring(0, i);
for (LookupResult lr : lookup.lookup(prefix, true, 10)) {
Assert.assertTrue(lr.key.startsWith(prefix));
}
}
}
}
private void assertMatchEquals(List<LookupResult> res, String... expected) {
String [] result = new String [res.size()];
for (int i = 0; i < res.size(); i++)
result[i] = res.get(i).toString();
if (!Arrays.equals(expected, result)) {
int colLen = Math.max(maxLen(expected), maxLen(result));
StringBuilder b = new StringBuilder();
String format = "%" + colLen + "s " + "%" + colLen + "s\n";
b.append(String.format(Locale.ENGLISH, format, "Expected", "Result"));
for (int i = 0; i < Math.max(result.length, expected.length); i++) {
b.append(String.format(Locale.ENGLISH, format,
i < expected.length ? expected[i] : "--",
i < result.length ? result[i] : "--"));
}
System.err.println(b.toString());
fail("Expected different output:\n" + b.toString());
}
}
private int maxLen(String[] result) {
int len = 0;
for (String s : result)
len = Math.max(len, s.length());
return len;
}
}