SOLR-2378: A new, automaton-based implementation of the suggest (autocomplete)
component, offering an order of magnitude smaller memory consumption
compared to ternary trees and jaspell, and very fast lookups at runtime.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1092136 13f79535-47bb-0310-9956-ffa450edef68
Dawid Weiss 2011-04-14 11:16:43 +00:00
parent 2133423e2b
commit 191706df70
17 changed files with 51252 additions and 191 deletions

View File

@ -61,6 +61,11 @@ Detailed Change List
 New Features
 ----------------------
 
+* SOLR-2378: A new, automaton-based implementation of the suggest (autocomplete)
+  component, offering an order of magnitude smaller memory consumption
+  compared to ternary trees and jaspell, and very fast lookups at runtime.
+  (Dawid Weiss)
+
 * SOLR-571: The autowarmCount for LRUCaches (LRUCache and FastLRUCache) now
   supports "percentages" which get evaluated relative to the current size of
   the cache when warming happens.

View File

@ -162,7 +162,7 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAware
       } else {
         throw new SolrException(SolrException.ErrorCode.NOT_FOUND,
-            "Specified dictionary does not exist.");
+            "Specified dictionary does not exist: " + getDictionaryName(params));
       }
     }
   }

View File

@ -12,7 +12,6 @@ import org.apache.solr.core.SolrCore;
 import org.apache.solr.util.TermFreqIterator;
 
 public abstract class Lookup {
   /**
    * Result of a lookup.
    */

View File

@ -0,0 +1,556 @@
package org.apache.solr.spelling.suggest.fst;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.fst.Builder;
import org.apache.lucene.util.automaton.fst.FST;
import org.apache.lucene.util.automaton.fst.FST.Arc;
import org.apache.lucene.util.automaton.fst.NoOutputs;
import org.apache.lucene.util.automaton.fst.Outputs;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.spelling.suggest.Lookup;
import org.apache.solr.spelling.suggest.tst.TSTLookup;
import org.apache.solr.util.TermFreqIterator;
import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
/**
* Finite state automaton-based implementation of the {@link Lookup} query
* suggestion/autocomplete interface.
*
* <h2>Implementation details</h2>
*
* <p>The construction step in {@link #build(TermFreqIterator)} works as follows:
* <ul>
* <li>A set of input terms (String) and weights (float) is given.</li>
* <li>The range of weights is determined and then all weights are discretized into a fixed set
* of values ({@link #buckets}).
* Note that this means that minor changes in weights may be lost during automaton construction.
* In general, this is not a big problem because the "priorities" of completions can be split
* into a fixed set of classes (even as rough as: very frequent, frequent, baseline, marginal).
* If you need exact, fine-grained weights, use {@link TSTLookup} instead.</li>
* <li>All terms in the input are prepended with a synthetic pseudo-character that encodes the weight
* of that term. For example, a term <code>abc</code> with a discretized weight of '1' would
* become <code>1abc</code>.</li>
* <li>The terms are then sorted by the raw utf16 values of their characters (including the synthetic
* weight character in front).</li>
* <li>A finite state automaton ({@link FST}) is constructed from the input. The root node has
* arcs labeled with all possible weights. We cache all these arcs, highest-weight first.</li>
* </ul>
*
* <p>At runtime, in {@link #lookup(String, boolean, int)}, the automaton is utilized as follows:
* <ul>
* <li>For each possible term weight encoded in the automaton (cached arcs from the root above),
* starting with the highest one, we descend along the path of the input key. If the key is not
* a prefix of a sequence in the automaton (the path ends prematurely), we exit immediately:
* no completions are possible.</li>
* <li>Otherwise, we have found an internal automaton node that ends the key. <b>The entire
* subautomaton (all paths) starting from this node form the key's completions.</b> We start
* the traversal of this subautomaton. Every time we reach a final state (arc), we add a single
* suggestion to the list of results (the weight of this suggestion is constant and equal to the
* root path we started from). The tricky part is that because automaton edges are sorted and
* we scan depth-first, we can terminate the entire procedure as soon as we have collected as
* many suggestions as the user requested.</li>
* <li>In case the number of suggestions collected in the step above is still insufficient,
* we proceed to the next (smaller) weight leaving the root node and repeat the same
* algorithm again.
* </li>
* </ul>
*
* <h2>Runtime behavior and performance characteristics</h2>
*
* <p>The algorithm described above is optimized for finding suggestions to short prefixes
* in a top-weights-first order. This is probably the most common use case: it allows
* presenting suggestions early and sorts them by the global frequency (and then alphabetically).
*
* <p>If there is an exact match in the automaton, it is returned first on the results
* list (even with by-weight sorting).
*
* <p>Note that the maximum lookup time for <b>any prefix</b>
* is the time of descending to the subtree, plus traversal of the subtree up to the number
* of requested suggestions (because they are already presorted by weight on the root level
* and alphabetically at any node level).
*
* <p>To order alphabetically only (no ordering by priorities), use identical term weights
* for all terms. Alphabetical suggestions are returned even if non-constant weights are
* used, but the algorithm for doing this is suboptimal.
*
* <p>"alphabetically" in any of the documentation above indicates utf16 codepoint order,
* nothing else.
*/
public class FSTLookup extends Lookup {
/** A structure for a single entry (for sorting/ preprocessing). */
private static class Entry {
char [] term;
float weight;
public Entry(char [] term, float freq) {
this.term = term;
this.weight = freq;
}
}
/**
* The number of separate buckets for weights (discretization). The more buckets,
* the more fine-grained term weights (priorities) can be assigned. The speed of lookup
* will not decrease for prefixes which have highly-weighted completions (because these
* are filled-in first), but will decrease significantly for low-weighted terms (but
* these should be infrequent, so it is all right).
*
* <p>The number of buckets must be within [1, 255] range.
*/
public static final String WEIGHT_BUCKETS = "weightBuckets";
/**
* If <code>true</code>, exact suggestions are returned first, even if they are prefixes
* of other strings in the automaton (possibly with larger weights).
*/
public static final String EXACT_MATCH_FIRST = "exactMatchFirst";
/** Serialized automaton file name (storage). */
public static final String FILENAME = "fst.dat";
/** An empty result. */
private static final List<LookupResult> EMPTY_RESULT = Lists.newArrayList();
/**
* @see #WEIGHT_BUCKETS
*/
private int buckets = 10;
/**
* @see #EXACT_MATCH_FIRST
*/
private boolean exactMatchFirst = true;
/**
* Finite state automaton encoding all the lookup terms. See class
* notes for details.
*/
private FST<Object> automaton;
/**
* An array of arcs leaving the root automaton state and encoding weights of all
* completions in their sub-trees.
*/
private Arc<Object> [] rootArcs;
/* */
@Override
@SuppressWarnings("rawtypes")
public void init(NamedList config, SolrCore core) {
this.buckets = config.get(WEIGHT_BUCKETS) != null
? Integer.parseInt(config.get(WEIGHT_BUCKETS).toString())
: 10;
this.exactMatchFirst = config.get(EXACT_MATCH_FIRST) != null
? Boolean.valueOf(config.get(EXACT_MATCH_FIRST).toString())
: true;
}
/* */
@Override
public void build(TermFreqIterator tfit) throws IOException {
// Buffer the input because we will need it twice: for calculating
// the weight distribution and for the actual automaton building.
List<Entry> entries = Lists.newArrayList();
while (tfit.hasNext()) {
String term = tfit.next();
char [] termChars = new char [term.length() + 1]; // add padding for weight.
for (int i = 0; i < term.length(); i++)
termChars[i + 1] = term.charAt(i);
entries.add(new Entry(termChars, tfit.freq()));
}
// Distribute weights into at most N buckets. This is a form of discretization to
// limit the number of possible weights so that they can be efficiently encoded in the
// automaton.
//
// It is assumed the distribution of weights is _linear_, so a proportional division
// of the [min, max] range is enough here. An alternative would be to sort the
// weights and divide them into ranges of equal counts.
if (entries.size() > 0) {
redistributeWeightsProportionalMinMax(entries, buckets);
encodeWeightPrefix(entries);
}
// Build the automaton (includes input sorting) and cache root arcs in order from the highest,
// to the lowest weight.
this.automaton = buildAutomaton(entries);
cacheRootArcs();
}
/**
* Cache the root node's output arcs starting with completions with the highest weights.
*/
@SuppressWarnings("unchecked")
private void cacheRootArcs() throws IOException {
if (automaton != null) {
List<Arc<Object>> rootArcs = Lists.newArrayList();
Arc<Object> arc = automaton.getFirstArc(new Arc<Object>());
automaton.readFirstTargetArc(arc, arc);
while (true) {
rootArcs.add(new Arc<Object>().copyFrom(arc));
if (arc.isLast())
break;
automaton.readNextArc(arc);
}
Collections.reverse(rootArcs); // we want highest weights first.
this.rootArcs = rootArcs.toArray(new Arc[rootArcs.size()]);
}
}
/**
* Not implemented.
*/
@Override
public boolean add(String key, Object value) {
// This implementation does not support ad-hoc additions (all input
// must be sorted for the builder).
return false;
}
/**
* Get the (approximated) weight of a single key (if there is a perfect match
* for it in the automaton).
*
* @return Returns the approximated weight of the input key or <code>null</code>
* if not found.
*/
@Override
public Float get(String key) {
return getExactMatchStartingFromRootArc(0, key);
}
/**
* Returns the first exact match by traversing root arcs, starting from
* the arc <code>i</code>.
*
* @param i The first root arc index in {@link #rootArcs} to consider when
* matching.
*/
private Float getExactMatchStartingFromRootArc(int i, String key) {
// Traverse root arcs, starting from index i, looking for an exact key match.
try {
final FST.Arc<Object> scratch = new FST.Arc<Object>();
for (; i < rootArcs.length; i++) {
final FST.Arc<Object> rootArc = rootArcs[i];
final FST.Arc<Object> arc = scratch.copyFrom(rootArc);
// Descend into the automaton using the key as prefix.
if (descendWithPrefix(arc, key)) {
automaton.readFirstTargetArc(arc, arc);
if (arc.label == FST.END_LABEL) {
// Prefix-encoded weight.
return rootArc.label / (float) buckets;
}
}
}
} catch (IOException e) {
// Should never happen, but anyway.
throw new RuntimeException(e);
}
return null;
}
/**
* Lookup autocomplete suggestions to <code>key</code>.
*
* @param key The prefix for which suggestions should be sought.
* @param onlyMorePopular Return most popular suggestions first. This is the default
* behavior for this implementation. Setting it to <code>false</code> has no effect (use
* constant term weights to sort alphabetically only).
* @param num At most this number of suggestions will be returned.
* @return Returns the suggestions, sorted by their approximated weight first (decreasing)
* and then alphabetically (utf16 codepoint order).
*/
@Override
public List<LookupResult> lookup(String key, boolean onlyMorePopular, int num) {
if (key.length() == 0 || automaton == null) {
// Keep the result an ArrayList to keep calls monomorphic.
return EMPTY_RESULT;
}
try {
if (!onlyMorePopular && rootArcs.length > 1) {
// We could emit a warning here (?). An optimal strategy for alphabetically sorted
// suggestions would be to add them with a constant weight -- this saves unnecessary
// traversals and sorting.
return lookupSortedAlphabetically(key, num);
} else {
return lookupSortedByWeight(key, num, true);
}
} catch (IOException e) {
// Should never happen, but anyway.
throw new RuntimeException(e);
}
}
/**
* Lookup suggestions sorted alphabetically <b>if weights are not constant</b>. This
* is a workaround: in general, use constant weights for alphabetically sorted results.
*/
private List<LookupResult> lookupSortedAlphabetically(String key, int num) throws IOException {
// Greedily get num results from each weight branch.
List<LookupResult> res = lookupSortedByWeight(key, num, false);
// Sort and trim.
Collections.sort(res, new Comparator<LookupResult>() {
@Override
public int compare(LookupResult o1, LookupResult o2) {
return o1.key.compareTo(o2.key);
}
});
if (res.size() > num) {
res = res.subList(0, num);
}
return res;
}
/**
* Lookup suggestions sorted by weight (descending order).
*
* @param greedy If <code>true</code>, the routine terminates immediately when <code>num</code>
* suggestions have been collected. If <code>false</code>, it will collect suggestions from
* all weight arcs (needed for {@link #lookupSortedAlphabetically}).
*/
private ArrayList<LookupResult> lookupSortedByWeight(String key, int num, boolean greedy) throws IOException {
final ArrayList<LookupResult> res = new ArrayList<LookupResult>(Math.min(10, num));
final StringBuilder output = new StringBuilder(key);
final int matchLength = key.length() - 1;
for (int i = 0; i < rootArcs.length; i++) {
final FST.Arc<Object> rootArc = rootArcs[i];
final FST.Arc<Object> arc = new FST.Arc<Object>().copyFrom(rootArc);
// Descend into the automaton using the key as prefix.
if (descendWithPrefix(arc, key)) {
// Prefix-encoded weight.
final float weight = rootArc.label / (float) buckets;
// A subgraph starting from the current node has the completions
// of the key prefix. The arc we are at is the key's last character,
// so we will collect it too.
output.setLength(matchLength);
if (collect(res, num, weight, output, arc) && greedy) {
// We have enough suggestions to return immediately. Keep on looking for an
// exact match, if requested.
if (exactMatchFirst) {
Float exactMatchWeight = getExactMatchStartingFromRootArc(i, key);
if (exactMatchWeight != null) {
res.add(0, new LookupResult(key, exactMatchWeight));
while (res.size() > num) {
res.remove(res.size() - 1);
}
}
}
break;
}
}
}
return res;
}
/**
* Descend along the path starting at <code>arc</code> and going through the
* characters of <code>term</code>.
*
* @param arc The starting arc. This argument is modified in-place.
* @param term The term to descend along.
* @return If <code>true</code>, <code>arc</code> will be set to the arc matching the
* last character of <code>term</code>. <code>false</code> is returned if no such
* prefix exists in the automaton.
*/
private boolean descendWithPrefix(Arc<Object> arc, String term) throws IOException {
final int max = term.length();
for (int i = 0; i < max; i++) {
if (automaton.findTargetArc(term.charAt(i) & 0xffff, arc, arc) == null) {
// No matching prefixes, return an empty result.
return false;
}
}
return true;
}
/**
* Recursively collect lookup results from the automaton subgraph starting at <code>arc</code>.
*
* @param num Maximum number of results needed (early termination).
* @param weight Weight of all results found during this collection.
*/
private boolean collect(List<LookupResult> res, int num, float weight, StringBuilder output, Arc<Object> arc) throws IOException {
output.append((char) arc.label);
automaton.readFirstTargetArc(arc, arc);
while (true) {
if (arc.label == FST.END_LABEL) {
res.add(new LookupResult(output.toString(), weight));
if (res.size() >= num)
return true;
} else {
int save = output.length();
if (collect(res, num, weight, output, new Arc<Object>().copyFrom(arc))) {
return true;
}
output.setLength(save);
}
if (arc.isLast()) {
break;
}
automaton.readNextArc(arc);
}
return false;
}
/**
* Builds the final automaton from a list of entries.
*/
private FST<Object> buildAutomaton(List<Entry> entries) throws IOException {
if (entries.size() == 0)
return null;
// Sort by utf16 (raw char value)
final Comparator<Entry> comp = new Comparator<Entry>() {
public int compare(Entry o1, Entry o2) {
char [] ch1 = o1.term;
char [] ch2 = o2.term;
int len1 = ch1.length;
int len2 = ch2.length;
int max = Math.min(len1, len2);
for (int i = 0; i < max; i++) {
int v = ch1[i] - ch2[i];
if (v != 0) return v;
}
return len1 - len2;
}
};
Collections.sort(entries, comp);
// Remove identical duplicate entries; they would otherwise
// break automaton construction.
int len = entries.size();
int j = 0;
for (int i = 1; i < len; i++) {
if (comp.compare(entries.get(j), entries.get(i)) != 0) {
entries.set(++j, entries.get(i));
}
}
entries = entries.subList(0, j + 1);
// Build the automaton.
final Outputs<Object> outputs = NoOutputs.getSingleton();
final Object empty = outputs.getNoOutput();
final Builder<Object> builder =
new Builder<Object>(FST.INPUT_TYPE.BYTE4, 0, 0, true, outputs);
final IntsRef scratchIntsRef = new IntsRef(10);
for (Entry e : entries) {
final int termLength = scratchIntsRef.length = e.term.length;
scratchIntsRef.grow(termLength);
final int [] ints = scratchIntsRef.ints;
final char [] chars = e.term;
for (int i = termLength; --i >= 0;) {
ints[i] = chars[i];
}
builder.add(scratchIntsRef, empty);
}
return builder.finish();
}
/**
* Prepends the entry's weight to each entry, encoded as a single character, so that the
* root automaton node fans out to all possible priorities, starting with the arc that has
* the highest weight.
*/
private void encodeWeightPrefix(List<Entry> entries) {
for (Entry e : entries) {
int weight = (int) e.weight;
assert (weight >= 0 && weight <= buckets) :
"Weight out of range: " + weight + " [" + buckets + "]";
// There should be a single empty char reserved in front for the weight.
e.term[0] = (char) weight;
}
}
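  // Illustrative note (editor's addition, not in the original source): with
  // buckets = 10, a term "abc" whose discretized weight is 7 is stored as
  // "\u0007abc". At lookup time the root arc label 7 is decoded back into an
  // approximate weight of 7 / 10 = 0.7f (see getExactMatchStartingFromRootArc
  // and lookupSortedByWeight above).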
/**
* Split the [min, max] range into buckets, reassigning weights. Entries' weights are
* remapped to the [0, buckets] range (so there are actually buckets + 1 distinct values).
*/
private void redistributeWeightsProportionalMinMax(List<Entry> entries, int buckets) {
float min = entries.get(0).weight;
float max = min;
for (Entry e : entries) {
min = Math.min(e.weight, min);
max = Math.max(e.weight, max);
}
final float range = max - min;
for (Entry e : entries) {
e.weight = (int) (buckets * ((e.weight - min) / range)); // int cast equiv. to floor()
}
}
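  // Worked example (editor's addition, not in the original source): for input
  // weights {1, 5, 10} and buckets = 10 we get min = 1, max = 10, range = 9,
  // so the remapped weights are floor(10 * 0/9) = 0, floor(10 * 4/9) = 4 and
  // floor(10 * 9/9) = 10 -- eleven possible values in total, matching the
  // [0, buckets] range described in the javadoc above.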
/**
* Deserialization from disk.
*/
@Override
public synchronized boolean load(File storeDir) throws IOException {
File data = new File(storeDir, FILENAME);
if (!data.exists() || !data.canRead()) {
return false;
}
InputStream is = new BufferedInputStream(new FileInputStream(data));
try {
this.automaton = new FST<Object>(new InputStreamDataInput(is), NoOutputs.getSingleton());
cacheRootArcs();
} finally {
Closeables.closeQuietly(is);
}
return true;
}
/**
* Serialization to disk.
*/
@Override
public synchronized boolean store(File storeDir) throws IOException {
if (!storeDir.exists() || !storeDir.isDirectory() || !storeDir.canWrite()) {
return false;
}
if (this.automaton == null)
return false;
File data = new File(storeDir, FILENAME);
OutputStream os = new BufferedOutputStream(new FileOutputStream(data));
try {
this.automaton.save(new OutputStreamDataOutput(os));
} finally {
Closeables.closeQuietly(os);
}
return true;
}
}
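
For illustration, a minimal usage sketch of the class above (an editor's addition, not part of the commit; it borrows the TermFreq and TermFreqArrayIterator helpers that this commit adds under the test sources):

import java.util.List;
import org.apache.solr.spelling.suggest.Lookup.LookupResult;
import org.apache.solr.spelling.suggest.TermFreq;
import org.apache.solr.spelling.suggest.TermFreqArrayIterator;
import org.apache.solr.spelling.suggest.fst.FSTLookup;

public class FSTLookupExample {
  public static void main(String[] args) throws Exception {
    // Build the automaton from term/weight pairs. Weights are discretized into
    // buckets, so only their relative order is guaranteed to be preserved.
    FSTLookup lookup = new FSTLookup();
    lookup.build(new TermFreqArrayIterator(new TermFreq[] {
        new TermFreq("acquire", 2f),
        new TermFreq("accommodate", 1f),
        new TermFreq("accident", 1f)}));

    // Ask for at most two completions of "ac", most popular first.
    List<LookupResult> results = lookup.lookup("ac", true, 2);
    for (LookupResult r : results) {
      System.out.println(r.key); // "acquire" (highest weight), then "accident" (utf16 order).
    }
  }
}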

View File

@ -0,0 +1,31 @@
package org.apache.solr.spelling.suggest.fst;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import org.apache.lucene.store.DataInput;
import com.google.common.io.ByteStreams;
/**
* A {@link DataInput} wrapping a plain {@link InputStream}.
*/
public class InputStreamDataInput extends DataInput {
private final InputStream is;
public InputStreamDataInput(InputStream is) {
this.is = is;
}
@Override
public byte readByte() throws IOException {
int v = is.read();
if (v == -1) throw new EOFException();
return (byte) v;
}
@Override
public void readBytes(byte[] b, int offset, int len) throws IOException {
ByteStreams.readFully(is, b, offset, len);
}
}

View File

@ -0,0 +1,28 @@
package org.apache.solr.spelling.suggest.fst;
import java.io.IOException;
import java.io.OutputStream;
import org.apache.lucene.store.DataOutput;
/**
* A {@link DataOutput} wrapping a plain {@link OutputStream}.
*/
public class OutputStreamDataOutput extends DataOutput {
private final OutputStream os;
public OutputStreamDataOutput(OutputStream os) {
this.os = os;
}
@Override
public void writeByte(byte b) throws IOException {
os.write(b);
}
@Override
public void writeBytes(byte[] b, int offset, int length) throws IOException {
os.write(b, offset, length);
}
}
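
A quick round-trip sketch of the two adapters (editor's addition, illustrative only; assumed to live in the same package as the adapters): bytes pushed through OutputStreamDataOutput can be read back via InputStreamDataInput, which is exactly the pairing FSTLookup.store() and FSTLookup.load() rely on to persist the automaton:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;

public class StreamAdapterExample {
  public static void main(String[] args) throws Exception {
    // Write a few bytes through the DataOutput adapter...
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    OutputStreamDataOutput out = new OutputStreamDataOutput(baos);
    out.writeByte((byte) 42);
    out.writeBytes(new byte[] {1, 2, 3}, 0, 3);

    // ...and read them back through the DataInput adapter.
    InputStreamDataInput in = new InputStreamDataInput(
        new ByteArrayInputStream(baos.toByteArray()));
    System.out.println(in.readByte());  // 42
    byte[] buf = new byte[3];
    in.readBytes(buf, 0, 3);            // buf now holds {1, 2, 3}
  }
}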

File diff suppressed because it is too large

View File

@ -31,7 +31,7 @@
   <requestHandler name="standard" class="solr.StandardRequestHandler" />
 
   <!-- Suggest component -->
-  <searchComponent class="solr.SpellCheckComponent" name="suggest">
+  <searchComponent class="solr.SpellCheckComponent" name="suggest_jaspell">
     <lst name="spellchecker">
       <str name="name">suggest</str>
       <str name="classname">org.apache.solr.spelling.suggest.Suggester</str>
@ -45,6 +45,38 @@
     </lst>
   </searchComponent>
 
+  <!-- TSTLookup suggest component -->
+  <searchComponent class="solr.SpellCheckComponent" name="suggest_tst">
+    <lst name="spellchecker">
+      <str name="name">suggest_tst</str>
+      <str name="classname">org.apache.solr.spelling.suggest.Suggester</str>
+      <str name="lookupImpl">org.apache.solr.spelling.suggest.tst.TSTLookup</str>
+      <str name="field">suggest</str>
+      <str name="storeDir">suggest_tst</str>
+      <str name="buildOnCommit">true</str>
+
+      <!-- Suggester properties -->
+      <float name="threshold">0.0</float>
+    </lst>
+  </searchComponent>
+
+  <!-- FSTLookup suggest component -->
+  <searchComponent class="solr.SpellCheckComponent" name="suggest_fst">
+    <lst name="spellchecker">
+      <str name="name">suggest_fst</str>
+      <str name="classname">org.apache.solr.spelling.suggest.Suggester</str>
+      <str name="lookupImpl">org.apache.solr.spelling.suggest.fst.FSTLookup</str>
+      <str name="field">suggest</str>
+      <str name="storeDir">suggest_fst</str>
+      <str name="buildOnCommit">true</str>
+
+      <!-- Suggester properties -->
+      <int name="weightBuckets">5</int>
+      <bool name="exactMatchFirst">true</bool>
+    </lst>
+  </searchComponent>
+
+  <!-- The default (jaspell) -->
   <requestHandler class="org.apache.solr.handler.component.SearchHandler" name="/suggest">
     <lst name="defaults">
       <str name="spellcheck">true</str>
@ -52,7 +84,31 @@
       <str name="spellcheck.collate">true</str>
     </lst>
     <arr name="components">
-      <str>suggest</str>
+      <str>suggest_jaspell</str>
+    </arr>
+  </requestHandler>
+
+  <!-- tst (ternary tree based) -->
+  <requestHandler class="org.apache.solr.handler.component.SearchHandler" name="/suggest_tst">
+    <lst name="defaults">
+      <str name="spellcheck">true</str>
+      <str name="spellcheck.dictionary">suggest_tst</str>
+      <str name="spellcheck.collate">true</str>
+    </lst>
+    <arr name="components">
+      <str>suggest_tst</str>
+    </arr>
+  </requestHandler>
+
+  <!-- fst (finite state automaton based) -->
+  <requestHandler class="org.apache.solr.handler.component.SearchHandler" name="/suggest_fst">
+    <lst name="defaults">
+      <str name="spellcheck">true</str>
+      <str name="spellcheck.dictionary">suggest_fst</str>
+      <str name="spellcheck.collate">false</str>
+    </lst>
+    <arr name="components">
+      <str>suggest_fst</str>
     </arr>
   </requestHandler>
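
With the handlers above registered, the FST-backed suggester answers plain HTTP requests like any other Solr request handler. A hypothetical query (host, port and core layout are assumptions; the parameter names mirror those exercised in SuggesterTest below) might look like:

http://localhost:8983/solr/suggest_fst?q=ac&spellcheck.count=2&spellcheck.onlyMorePopular=true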

View File

@ -0,0 +1,52 @@
package org.apache.solr.spelling.suggest;
import java.util.List;
import java.util.Locale;
/**
* Average with standard deviation.
*/
final class Average
{
/**
* Average (in milliseconds).
*/
public final double avg;
/**
* Standard deviation (in milliseconds).
*/
public final double stddev;
/**
* Creates an average with the given mean value and standard deviation.
*/
Average(double avg, double stddev)
{
this.avg = avg;
this.stddev = stddev;
}
public String toString()
{
return String.format(Locale.ENGLISH, "%.0f [+- %.2f]",
avg, stddev);
}
static Average from(List<Double> values)
{
double sum = 0;
double sumSquares = 0;
for (double l : values)
{
sum += l;
sumSquares += l * l;
}
double avg = sum / (double) values.size();
return new Average(
(sum / (double) values.size()),
Math.sqrt(sumSquares / (double) values.size() - avg * avg));
}
}
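
A quick worked example of Average.from (editor's addition): for values {10.0, 12.0, 14.0}, sum = 36 and sumSquares = 440, so avg = 36 / 3 = 12 and stddev = sqrt(440 / 3 - 12 * 12) ≈ 1.63; toString() then renders "12 [+- 1.63]".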

View File

@ -0,0 +1,230 @@
package org.apache.solr.spelling.suggest;
import java.net.URL;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Random;
import java.util.concurrent.Callable;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.solr.spelling.suggest.fst.FSTLookup;
import org.apache.solr.spelling.suggest.jaspell.JaspellLookup;
import org.apache.solr.spelling.suggest.tst.TSTLookup;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;
import com.google.common.base.Charsets;
import com.google.common.base.Function;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.io.Resources;
/**
* Benchmark tests for implementations of the {@link Lookup} interface.
*/
@Ignore // COMMENT ME TO RUN BENCHMARKS!
public class LookupBenchmarkTest {
@SuppressWarnings("unchecked")
private final List<Class<? extends Lookup>> benchmarkClasses = Lists.newArrayList(
JaspellLookup.class,
TSTLookup.class,
FSTLookup.class);
private final static int rounds = 15;
private final static int warmup = 5;
private final int num = 7;
private final boolean onlyMorePopular = true;
private final static Random random = new Random(0xdeadbeef);
/**
* Input term/weight pairs.
*/
private static TermFreq [] dictionaryInput;
/**
* Benchmark term/weight pairs (randomized order).
*/
private static List<TermFreq> benchmarkInput;
/**
* Loads terms and frequencies from Wikipedia (cached).
*/
@BeforeClass
public static void setup() throws Exception {
List<TermFreq> input = readTop50KWiki();
Collections.shuffle(input, random);
LookupBenchmarkTest.dictionaryInput = input.toArray(new TermFreq [input.size()]);
Collections.shuffle(input, random);
LookupBenchmarkTest.benchmarkInput = input;
}
/**
* Collect the multilingual input for benchmarks/tests.
*/
public static List<TermFreq> readTop50KWiki() throws Exception {
List<TermFreq> input = Lists.newArrayList();
URL resource = Thread.currentThread().getContextClassLoader().getResource("Top50KWiki.utf8");
assert resource != null : "Resource missing: Top50KWiki.utf8";
for (String line : Resources.readLines(resource, Charsets.UTF_8)) {
int tab = line.indexOf('|');
Assert.assertTrue("No | separator?: " + line, tab >= 0);
float weight = Float.parseFloat(line.substring(tab + 1));
String key = line.substring(0, tab);
input.add(new TermFreq(key, weight));
}
return input;
}
/**
* Test construction time.
*/
@Test
public void testConstructionTime() throws Exception {
System.err.println("-- construction time");
for (final Class<? extends Lookup> cls : benchmarkClasses) {
BenchmarkResult result = measure(new Callable<Integer>() {
public Integer call() throws Exception {
final Lookup lookup = buildLookup(cls, dictionaryInput);
return lookup.hashCode();
}
});
System.err.println(
String.format(Locale.ENGLISH, "%-15s input: %d, time[ms]: %s",
cls.getSimpleName(),
dictionaryInput.length,
result.average.toString()));
}
}
/**
* Test memory required for the storage.
*/
@Test
public void testStorageNeeds() throws Exception {
System.err.println("-- RAM consumption");
final RamUsageEstimator rue = new RamUsageEstimator();
for (Class<? extends Lookup> cls : benchmarkClasses) {
Lookup lookup = buildLookup(cls, dictionaryInput);
System.err.println(
String.format(Locale.ENGLISH, "%-15s size[B]:%,13d",
lookup.getClass().getSimpleName(),
rue.estimateRamUsage(lookup)));
}
}
/**
* Create {@link Lookup} instance and populate it.
*/
private Lookup buildLookup(Class<? extends Lookup> cls, TermFreq[] input) throws Exception {
Lookup lookup = cls.newInstance();
lookup.build(new TermFreqArrayIterator(input));
return lookup;
}
/**
* Test performance of lookup on full hits (prefixes at least as long as the terms themselves).
*/
@Test
public void testPerformanceOnFullHits() throws Exception {
final int minPrefixLen = 100;
final int maxPrefixLen = 200;
runPerformanceTest(minPrefixLen, maxPrefixLen, num, onlyMorePopular);
}
/**
* Test performance of lookup on longer term prefixes (6-9 letters, or the whole term if shorter).
*/
@Test
public void testPerformanceOnPrefixes6_9() throws Exception {
final int minPrefixLen = 6;
final int maxPrefixLen = 9;
runPerformanceTest(minPrefixLen, maxPrefixLen, num, onlyMorePopular);
}
/**
* Test performance of lookup on short term prefixes (2-4 letters, or the whole term if shorter).
*/
@Test
public void testPerformanceOnPrefixes2_4() throws Exception {
final int minPrefixLen = 2;
final int maxPrefixLen = 4;
runPerformanceTest(minPrefixLen, maxPrefixLen, num, onlyMorePopular);
}
/**
* Run the actual benchmark.
*/
public void runPerformanceTest(final int minPrefixLen, final int maxPrefixLen,
final int num, final boolean onlyMorePopular) throws Exception {
System.err.println(String.format(Locale.ENGLISH,
"-- prefixes: %d-%d, num: %d, onlyMorePopular: %s",
minPrefixLen, maxPrefixLen, num, onlyMorePopular));
for (Class<? extends Lookup> cls : benchmarkClasses) {
final Lookup lookup = buildLookup(cls, dictionaryInput);
final List<String> input = Lists.newArrayList(Iterables.transform(benchmarkInput, new Function<TermFreq, String>() {
public String apply(TermFreq tf) {
return tf.term.substring(0, Math.min(tf.term.length(),
minPrefixLen + random.nextInt(maxPrefixLen - minPrefixLen + 1)));
}
}));
BenchmarkResult result = measure(new Callable<Integer>() {
public Integer call() throws Exception {
int v = 0;
for (String term : input) {
v += lookup.lookup(term, onlyMorePopular, num).size();
}
return v;
}
});
System.err.println(
String.format(Locale.ENGLISH, "%-15s queries: %d, time[ms]: %s, ~qps: %.0f",
lookup.getClass().getSimpleName(),
input.size(),
result.average.toString(),
input.size() / result.average.avg));
}
}
/**
* Do the measurements.
*/
private BenchmarkResult measure(Callable<Integer> callable) {
final double NANOS_PER_MS = 1000000;
try {
List<Double> times = Lists.newArrayList();
for (int i = 0; i < warmup + rounds; i++) {
final long start = System.nanoTime();
guard = callable.call().intValue();
times.add((System.nanoTime() - start) / NANOS_PER_MS);
}
return new BenchmarkResult(times, warmup, rounds);
} catch (Exception e) {
throw new RuntimeException(e);
}
}
/** Guard against the JIT optimizing away the benchmarked calls. */
@SuppressWarnings("unused")
private static volatile int guard;
private static class BenchmarkResult {
/** Average time per round (ms). */
public final Average average;
public BenchmarkResult(List<Double> times, int warmup, int rounds) {
this.average = Average.from(times.subList(warmup, times.size()));
}
}
}

View File

@ -19,13 +19,13 @@ package org.apache.solr.spelling.suggest;
 import java.io.File;
 
 import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.spelling.suggest.fst.FSTLookup;
 import org.apache.solr.spelling.suggest.jaspell.JaspellLookup;
 import org.apache.solr.spelling.suggest.tst.TSTLookup;
 import org.junit.Test;
 
 public class PersistenceTest extends SolrTestCaseJ4 {
-  public final String[] keys = new String[] {
+  public static final String[] keys = new String[] {
     "one",
     "two",
     "three",
@ -40,41 +40,53 @@ public class PersistenceTest extends SolrTestCaseJ4 {
     "threat",
     "foundation",
     "fourier",
-    "fourty"
-  };
+    "fourty"};
 
   @Test
   public void testTSTPersistence() throws Exception {
-    TSTLookup lookup = new TSTLookup();
-    for (String k : keys) {
-      lookup.add(k, new Float(k.length()));
-    }
-    File storeDir = new File(TEST_HOME());
-    lookup.store(storeDir);
-    lookup = new TSTLookup();
-    lookup.load(storeDir);
-    for (String k : keys) {
-      Float val = (Float)lookup.get(k);
-      assertNotNull(k, val);
-      assertEquals(k, k.length(), val.intValue());
-    }
+    runTest(TSTLookup.class, true);
   }
 
   @Test
   public void testJaspellPersistence() throws Exception {
-    JaspellLookup lookup = new JaspellLookup();
-    for (String k : keys) {
-      lookup.add(k, new Float(k.length()));
-    }
-    File storeDir = new File(TEST_HOME());
-    lookup.store(storeDir);
-    lookup = new JaspellLookup();
-    lookup.load(storeDir);
-    for (String k : keys) {
-      Float val = (Float)lookup.get(k);
-      assertNotNull(k, val);
-      assertEquals(k, k.length(), val.intValue());
-    }
+    runTest(JaspellLookup.class, true);
+  }
+
+  @Test
+  public void testFSTPersistence() throws Exception {
+    runTest(FSTLookup.class, false);
+  }
+
+  private void runTest(Class<? extends Lookup> lookupClass,
+      boolean supportsExactWeights) throws Exception {
+    // Add all input keys.
+    Lookup lookup = lookupClass.newInstance();
+    TermFreq[] keys = new TermFreq[this.keys.length];
+    for (int i = 0; i < keys.length; i++)
+      keys[i] = new TermFreq(this.keys[i], (float) i);
+    lookup.build(new TermFreqArrayIterator(keys));
+
+    // Store the suggester.
+    File storeDir = new File(TEST_HOME());
+    lookup.store(storeDir);
+
+    // Re-read it from disk.
+    lookup = lookupClass.newInstance();
+    lookup.load(storeDir);
+
+    // Assert validity.
+    float previous = Float.NEGATIVE_INFINITY;
+    for (TermFreq k : keys) {
+      Float val = (Float) lookup.get(k.term);
+      assertNotNull(k.term, val);
+      if (supportsExactWeights) {
+        assertEquals(k.term, Float.valueOf(k.v), val);
+      } else {
+        assertTrue(val + ">=" + previous, val >= previous);
+        previous = val.floatValue();
+      }
+    }
   }
 }

View File

@ -0,0 +1,7 @@
package org.apache.solr.spelling.suggest;
public class SuggesterFSTTest extends SuggesterTest {
public SuggesterFSTTest() {
super.requestUri = "/suggest_fst";
}
}

View File

@ -0,0 +1,7 @@
package org.apache.solr.spelling.suggest;
public class SuggesterTSTTest extends SuggesterTest {
public SuggesterTSTTest() {
super.requestUri = "/suggest_tst";
}
}

View File

@ -17,28 +17,19 @@
 package org.apache.solr.spelling.suggest;
 
-import org.apache.lucene.util.RamUsageEstimator;
+import java.io.File;
+
 import org.apache.solr.SolrTestCaseJ4;
 import org.apache.solr.common.params.SpellingParams;
-import org.apache.solr.spelling.suggest.Lookup.LookupResult;
-import org.apache.solr.spelling.suggest.jaspell.JaspellLookup;
-import org.apache.solr.spelling.suggest.tst.TSTLookup;
-import org.apache.solr.util.TermFreqIterator;
 import org.junit.BeforeClass;
-import org.junit.Ignore;
 import org.junit.Test;
 
-import com.google.common.collect.Lists;
-
-import java.io.File;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Random;
-
 public class SuggesterTest extends SolrTestCaseJ4 {
+  /**
+   * Expected URI at which the given suggester will live.
+   */
+  protected String requestUri = "/suggest";
+
   @BeforeClass
   public static void beforeClass() throws Exception {
     initCore("solrconfig-spellchecker.xml","schema-spellchecker.xml");
@ -59,10 +50,9 @@ public class SuggesterTest extends SolrTestCaseJ4 {
   @Test
   public void testSuggestions() throws Exception {
     addDocs();
     assertU(commit()); // configured to do a rebuild on commit
-    assertQ(req("qt","/suggest", "q","ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
+    assertQ(req("qt", requestUri, "q", "ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
         "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/int[@name='numFound'][.='2']",
         "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/arr[@name='suggestion']/str[1][.='acquire']",
         "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/arr[@name='suggestion']/str[2][.='accommodate']"
@ -82,7 +72,7 @@ public class SuggesterTest extends SolrTestCaseJ4 {
     dataDir = data;
     configString = config;
     initCore();
-    assertQ(req("qt","/suggest", "q","ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
+    assertQ(req("qt", requestUri, "q", "ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
         "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/int[@name='numFound'][.='2']",
         "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/arr[@name='suggestion']/str[1][.='acquire']",
         "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/arr[@name='suggestion']/str[2][.='accommodate']"
@ -96,132 +86,13 @@ public class SuggesterTest extends SolrTestCaseJ4 {
   public void testRebuild() throws Exception {
     addDocs();
     assertU(commit());
-    assertQ(req("qt","/suggest", "q","ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
+    assertQ(req("qt", requestUri, "q", "ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
         "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/int[@name='numFound'][.='2']");
     assertU(adoc("id", "4",
         "text", "actually"
        ));
     assertU(commit());
-    assertQ(req("qt","/suggest", "q","ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
+    assertQ(req("qt", requestUri, "q", "ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
         "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/int[@name='numFound'][.='2']");
   }
-
-  private TermFreqIterator getTFIT() {
-    final int count = 100000;
-    TermFreqIterator tfit = new TermFreqIterator() {
-      Random r = new Random(1234567890L);
-      Random r1 = new Random(1234567890L);
-      int pos;
-
-      public float freq() {
-        return r1.nextInt(4);
-      }
-
-      public boolean hasNext() {
-        return pos < count;
-      }
-
-      public String next() {
-        pos++;
-        return Long.toString(r.nextLong());
-      }
-
-      public void remove() {
-        throw new UnsupportedOperationException();
-      }
-    };
-    return tfit;
-  }
-
-  static class Bench {
-    long buildTime;
-    long lookupTime;
-  }
-
-  @Test @Ignore
-  public void testBenchmark() throws Exception {
-    final List<Class<? extends Lookup>> benchmarkClasses = Lists.newArrayList();
-    benchmarkClasses.add(JaspellLookup.class);
-    benchmarkClasses.add(TSTLookup.class);
-
-    // Run a single pass just to see if everything works fine and provide size estimates.
-    final RamUsageEstimator rue = new RamUsageEstimator();
-    for (Class<? extends Lookup> cls : benchmarkClasses) {
-      Lookup lookup = singleBenchmark(cls, null);
-      System.err.println(
-          String.format(Locale.ENGLISH,
-              "%20s, size[B]=%,d",
-              lookup.getClass().getSimpleName(),
-              rue.estimateRamUsage(lookup)));
-    }
-
-    int warmupCount = 10;
-    int measuredCount = 100;
-    for (Class<? extends Lookup> cls : benchmarkClasses) {
-      Bench b = fullBenchmark(cls, warmupCount, measuredCount);
-      System.err.println(String.format(Locale.ENGLISH,
-          "%s: buildTime[ms]=%,d lookupTime[ms]=%,d",
-          cls.getSimpleName(),
-          (b.buildTime / measuredCount),
-          (b.lookupTime / measuredCount / 1000000)));
-    }
-  }
-
-  private Lookup singleBenchmark(Class<? extends Lookup> cls, Bench bench) throws Exception {
-    Lookup lookup = cls.newInstance();
-
-    long start = System.currentTimeMillis();
-    lookup.build(getTFIT());
-    long buildTime = System.currentTimeMillis() - start;
-
-    TermFreqIterator tfit = getTFIT();
-    long elapsed = 0;
-    while (tfit.hasNext()) {
-      String key = tfit.next();
-      // take only the first part of the key
-      int len = key.length() > 4 ? key.length() / 3 : 2;
-      String prefix = key.substring(0, len);
-      start = System.nanoTime();
-      List<LookupResult> res = lookup.lookup(prefix, true, 10);
-      elapsed += System.nanoTime() - start;
-      assertTrue(res.size() > 0);
-      for (LookupResult lr : res) {
-        assertTrue(lr.key.startsWith(prefix));
-      }
-    }
-    if (bench != null) {
-      bench.buildTime += buildTime;
-      bench.lookupTime += elapsed;
-    }
-    return lookup;
-  }
-
-  private Bench fullBenchmark(Class<? extends Lookup> cls, int warmupCount, int measuredCount) throws Exception {
-    System.err.println("* Running " + measuredCount + " iterations for " + cls.getSimpleName() + " ...");
-    System.err.println("  - warm-up " + warmupCount + " iterations...");
-    for (int i = 0; i < warmupCount; i++) {
-      System.runFinalization();
-      System.gc();
-      singleBenchmark(cls, null);
-    }
-
-    Bench b = new Bench();
-    System.err.print("  - main iterations:"); System.err.flush();
-    for (int i = 0; i < measuredCount; i++) {
-      System.runFinalization();
-      System.gc();
-      singleBenchmark(cls, b);
-      if (i > 0 && (i % 10 == 0)) {
-        System.err.print(" " + i);
-        System.err.flush();
-      }
-    }
-    System.err.println();
-    return b;
-  }
 }

View File

@ -0,0 +1,11 @@
package org.apache.solr.spelling.suggest;
public final class TermFreq {
public final String term;
public final float v;
public TermFreq(String term, float v) {
this.term = term;
this.v = v;
}
}

View File

@ -0,0 +1,40 @@
package org.apache.solr.spelling.suggest;
import java.util.Arrays;
import java.util.Iterator;
import org.apache.solr.util.TermFreqIterator;
/**
* A {@link TermFreqIterator} over a sequence of {@link TermFreq}s.
*/
public final class TermFreqArrayIterator implements TermFreqIterator {
private final Iterator<TermFreq> i;
private TermFreq current;
public TermFreqArrayIterator(Iterator<TermFreq> i) {
this.i = i;
}
public TermFreqArrayIterator(TermFreq [] i) {
this(Arrays.asList(i));
}
public TermFreqArrayIterator(Iterable<TermFreq> i) {
this(i.iterator());
}
public float freq() {
return current.v;
}
public boolean hasNext() {
return i.hasNext();
}
public String next() {
return (current = i.next()).term;
}
public void remove() { throw new UnsupportedOperationException(); }
}

View File

@ -0,0 +1,155 @@
package org.apache.solr.spelling.suggest.fst;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.Random;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.solr.spelling.suggest.Lookup.LookupResult;
import org.apache.solr.spelling.suggest.LookupBenchmarkTest;
import org.apache.solr.spelling.suggest.TermFreq;
import org.apache.solr.spelling.suggest.TermFreqArrayIterator;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import com.google.common.collect.Lists;
/**
* Unit tests for {@link FSTLookup}.
*/
public class FSTLookupTest extends LuceneTestCase {
public static TermFreq tf(String t, float v) {
return new TermFreq(t, v);
}
private FSTLookup lookup;
@Before
public void prepare() throws Exception {
final TermFreq[] keys = new TermFreq[] {
tf("one", 0.5f),
tf("oneness", 1),
tf("onerous", 1),
tf("onesimus", 1),
tf("two", 1),
tf("twofold", 1),
tf("twonk", 1),
tf("thrive", 1),
tf("through", 1),
tf("threat", 1),
tf("three", 1),
tf("foundation", 1),
tf("fourier", 1),
tf("four", 1),
tf("fourty", 1),
tf("xo", 1),
};
lookup = new FSTLookup();
lookup.build(new TermFreqArrayIterator(keys));
}
@Test
public void testExactMatchHighPriority() throws Exception {
assertMatchEquals(lookup.lookup("two", true, 1), "two/1.0");
}
@Test
public void testExactMatchLowPriority() throws Exception {
assertMatchEquals(lookup.lookup("one", true, 2),
"one/0.0",
"oneness/1.0");
}
@Test
public void testMiss() throws Exception {
assertMatchEquals(lookup.lookup("xyz", true, 1));
}
@Test
public void testAlphabeticWithWeights() throws Exception {
assertEquals(0, lookup.lookup("xyz", false, 1).size());
}
@Test
public void testFullMatchList() throws Exception {
assertMatchEquals(lookup.lookup("one", true, Integer.MAX_VALUE),
"oneness/1.0",
"onerous/1.0",
"onesimus/1.0",
"one/0.0");
}
@Test
public void testMultilingualInput() throws Exception {
List<TermFreq> input = LookupBenchmarkTest.readTop50KWiki();
lookup = new FSTLookup();
lookup.build(new TermFreqArrayIterator(input));
for (TermFreq tf : input) {
assertTrue("Not found: " + tf.term, lookup.get(tf.term) != null);
assertEquals(tf.term, lookup.lookup(tf.term, true, 1).get(0).key);
}
}
@Test
public void testEmptyInput() throws Exception {
lookup = new FSTLookup();
lookup.build(new TermFreqArrayIterator(new TermFreq[0]));
assertMatchEquals(lookup.lookup("", true, 10));
}
@Test
public void testRandom() throws Exception {
List<TermFreq> freqs = Lists.newArrayList();
Random rnd = random;
for (int i = 0; i < 5000; i++) {
freqs.add(new TermFreq("" + rnd.nextLong(), rnd.nextInt(100)));
}
lookup = new FSTLookup();
lookup.build(new TermFreqArrayIterator(freqs.toArray(new TermFreq[freqs.size()])));
for (TermFreq tf : freqs) {
final String term = tf.term;
for (int i = 1; i < term.length(); i++) {
String prefix = term.substring(0, i);
for (LookupResult lr : lookup.lookup(prefix, true, 10)) {
Assert.assertTrue(lr.key.startsWith(prefix));
}
}
}
}
private void assertMatchEquals(List<LookupResult> res, String... expected) {
String [] result = new String [res.size()];
for (int i = 0; i < res.size(); i++)
result[i] = res.get(i).toString();
if (!Arrays.equals(expected, result)) {
int colLen = Math.max(maxLen(expected), maxLen(result));
StringBuilder b = new StringBuilder();
String format = "%" + colLen + "s " + "%" + colLen + "s\n";
b.append(String.format(Locale.ENGLISH, format, "Expected", "Result"));
for (int i = 0; i < Math.max(result.length, expected.length); i++) {
b.append(String.format(Locale.ENGLISH, format,
i < expected.length ? expected[i] : "--",
i < result.length ? result[i] : "--"));
}
System.err.println(b.toString());
fail("Expected different output:\n" + b.toString());
}
}
private int maxLen(String[] result) {
int len = 0;
for (String s : result)
len = Math.max(len, s.length());
return len;
}
}