SOLR-2378: A new, automaton-based, implementation of suggest (autocomplete)

component, offering an order of magnitude smaller memory consumption
compared to ternary trees and jaspell and very fast lookups at runtime.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1092136 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Dawid Weiss 2011-04-14 11:16:43 +00:00
parent 2133423e2b
commit 191706df70
17 changed files with 51252 additions and 191 deletions

View File

@ -61,6 +61,11 @@ Detailed Change List
New Features
----------------------
* SOLR-2378: A new, automaton-based, implementation of suggest (autocomplete)
component, offering an order of magnitude smaller memory consumption
compared to ternary trees and jaspell and very fast lookups at runtime.
(Dawid Weiss)
* SOLR-571: The autowarmCount for LRUCaches (LRUCache and FastLRUCache) now
supports "percentages" which get evaluated relative the current size of
the cache when warming happens.

View File

@ -162,7 +162,7 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
} else {
throw new SolrException(SolrException.ErrorCode.NOT_FOUND,
"Specified dictionary does not exist.");
"Specified dictionary does not exist: " + getDictionaryName(params));
}
}
}

View File

@ -12,7 +12,6 @@ import org.apache.solr.core.SolrCore;
import org.apache.solr.util.TermFreqIterator;
public abstract class Lookup {
/**
* Result of a lookup.
*/

View File

@ -0,0 +1,556 @@
package org.apache.solr.spelling.suggest.fst;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.fst.Builder;
import org.apache.lucene.util.automaton.fst.FST;
import org.apache.lucene.util.automaton.fst.FST.Arc;
import org.apache.lucene.util.automaton.fst.NoOutputs;
import org.apache.lucene.util.automaton.fst.Outputs;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.spelling.suggest.Lookup;
import org.apache.solr.spelling.suggest.tst.TSTLookup;
import org.apache.solr.util.TermFreqIterator;
import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
/**
* Finite state automata based implementation of {@link Lookup} query
* suggestion/ autocomplete interface.
*
* <h2>Implementation details</h2>
*
* <p>The construction step in {@link #build(TermFreqIterator)} works as follows:
* <ul>
* <li>A set of input terms (String) and weights (float) is given.</li>
* <li>The range of weights is determined and then all weights are discretized into a fixed set
* of values ({@link #buckets}).
* Note that this means that minor changes in weights may be lost during automaton construction.
* In general, this is not a big problem because the "priorities" of completions can be split
* into a fixed set of classes (even as rough as: very frequent, frequent, baseline, marginal).
* If you need exact, fine-grained weights, use {@link TSTLookup} instead.</li>
* <li>All terms in the input are prepended with a synthetic pseudo-character being the weight
* of that term. For example a term <code>abc</code> with a discretized weight equal '1' would
* become <code>1abc</code>.</li>
* <li>The terms are sorted by their raw value of utf16 character values (including the synthetic
* term in front).</li>
* <li>A finite state automaton ({@link FST}) is constructed from the input. The root node has
* arcs labeled with all possible weights. We cache all these arcs, highest-weight first.</li>
* </ul>
*
* <p>At runtime, in {@link #lookup(String, boolean, int)}, the automaton is utilized as follows:
* <ul>
* <li>For each possible term weight encoded in the automaton (cached arcs from the root above),
* starting with the highest one, we descend along the path of the input key. If the key is not
* a prefix of a sequence in the automaton (path ends prematurely), we exit immediately.
* No completions.
* <li>Otherwise, we have found an internal automaton node that ends the key. <b>The entire
* subautomaton (all paths) starting from this node form the key's completions.</b> We start
* the traversal of this subautomaton. Every time we reach a final state (arc), we add a single
* suggestion to the list of results (the weight of this suggestion is constant and equal to the
* root path we started from). The tricky part is that because automaton edges are sorted and
* we scan depth-first, we can terminate the entire procedure as soon as we collect enough
* suggestions the user requested.
* <li>In case the number of suggestions collected in the step above is still insufficient,
* we proceed to the next (smaller) weight leaving the root node and repeat the same
* algorithm again.
* </li>
* </ul>
*
* <h2>Runtime behavior and performance characteristic</h2>
*
* <p>The algorithm described above is optimized for finding suggestions to short prefixes
* in a top-weights-first order. This is probably the most common use case: it allows
* presenting suggestions early and sorts them by the global frequency (and then alphabetically).
*
* <p>If there is an exact match in the automaton, it is returned first on the results
* list (even with by-weight sorting).
*
* <p>Note that the maximum lookup time for <b>any prefix</b>
* is the time of descending to the subtree, plus traversal of the subtree up to the number
* of requested suggestions (because they are already presorted by weight on the root level
* and alphabetically at any node level).
*
* <p>To order alphabetically only (no ordering by priorities), use identical term weights
* for all terms. Alphabetical suggestions are returned even if non-constant weights are
* used, but the algorithm for doing this is suboptimal.
*
* <p>"alphabetically" in any of the documentation above indicates utf16 codepoint order,
* nothing else.
*/
public class FSTLookup extends Lookup {
/**
 * A single input term together with its weight, buffered while the
 * automaton is being prepared (sorting and weight preprocessing).
 */
private static class Entry {
  /** Characters of the term; <code>build</code> leaves index 0 free for the weight prefix. */
  char[] term;

  /** Weight of the term; overwritten with the bucket number during preprocessing. */
  float weight;

  public Entry(char[] term, float freq) {
    this.weight = freq;
    this.term = term;
  }
}
/**
* The number of separate buckets for weights (discretization). The more buckets,
* the more fine-grained term weights (priorities) can be assigned. The speed of lookup
* will not decrease for prefixes which have highly-weighted completions (because these
* are filled-in first), but will decrease significantly for low-weighted terms (but
* these should be infrequent, so it is all right).
*
* <p>The number of buckets must be within [1, 255] range.
*/
public static final String WEIGHT_BUCKETS = "weightBuckets";
/**
* If <code>true</code>, exact suggestions are returned first, even if they are prefixes
* of other strings in the automaton (possibly with larger weights).
*/
public static final String EXACT_MATCH_FIRST = "exactMatchFirst";
/** Serialized automaton file name (storage). */
public static final String FILENAME = "fst.dat";
/** An empty result. */
private static final List<LookupResult> EMPTY_RESULT = Lists.newArrayList();
/**
* @see #WEIGHT_BUCKETS
*/
private int buckets = 10;
/**
* @see #EXACT_MATCH_FIRST
*/
private boolean exactMatchFirst = true;
/**
* Finite state automaton encoding all the lookup terms. See class
* notes for details.
*/
private FST<Object> automaton;
/**
* An array of arcs leaving the root automaton state and encoding weights of all
* completions in their sub-trees.
*/
private Arc<Object> [] rootArcs;
/* */
@Override
@SuppressWarnings("rawtypes")
public void init(NamedList config, SolrCore core) {
  // Read every setting once; a missing setting falls back to its default.
  final Object bucketsSetting = config.get(WEIGHT_BUCKETS);
  final int bucketCount = bucketsSetting != null
      ? Integer.parseInt(bucketsSetting.toString())
      : 10;
  // The weight is stored as a single synthetic character in front of each term
  // and lookups divide by the bucket count, so reject values outside the
  // documented [1, 255] range instead of silently building a broken automaton.
  if (bucketCount < 1 || bucketCount > 255) {
    throw new IllegalArgumentException(
        WEIGHT_BUCKETS + " must be within [1, 255] range: " + bucketCount);
  }
  this.buckets = bucketCount;

  final Object exactFirstSetting = config.get(EXACT_MATCH_FIRST);
  this.exactMatchFirst = exactFirstSetting != null
      ? Boolean.parseBoolean(exactFirstSetting.toString())
      : true;
}
/* */
@Override
public void build(TermFreqIterator tfit) throws IOException {
  // The input is consumed twice (weight distribution, then automaton
  // construction), so it is buffered in memory first.
  final List<Entry> buffered = Lists.newArrayList();
  while (tfit.hasNext()) {
    final String term = tfit.next();
    final int length = term.length();
    // One extra leading slot is reserved for the encoded weight.
    final char[] padded = new char[length + 1];
    term.getChars(0, length, padded, 1);
    buffered.add(new Entry(padded, tfit.freq()));
  }

  // Discretize weights into a limited number of buckets so that they can be
  // encoded compactly in the automaton. The distribution of weights is
  // assumed to be linear, hence a proportional split of the [min, max] range.
  if (!buffered.isEmpty()) {
    redistributeWeightsProportionalMinMax(buffered, buckets);
    encodeWeightPrefix(buffered);
  }

  // Construction sorts the input; afterwards the root arcs are cached,
  // highest weight first.
  this.automaton = buildAutomaton(buffered);
  cacheRootArcs();
}
/**
 * Stores copies of all arcs leaving the automaton's root in {@link #rootArcs},
 * in the reverse of the order in which the automaton enumerates them, so that
 * completions with the highest weights come first. Does nothing if there is
 * no automaton.
 */
@SuppressWarnings("unchecked")
private void cacheRootArcs() throws IOException {
  if (automaton == null) {
    return;
  }

  final List<Arc<Object>> collected = Lists.newArrayList();
  final Arc<Object> cursor = automaton.getFirstArc(new Arc<Object>());
  automaton.readFirstTargetArc(cursor, cursor);
  // The cursor is reused while iterating, so each arc is stored as a copy.
  collected.add(new Arc<Object>().copyFrom(cursor));
  while (!cursor.isLast()) {
    automaton.readNextArc(cursor);
    collected.add(new Arc<Object>().copyFrom(cursor));
  }

  Collections.reverse(collected); // highest weights first.
  this.rootArcs = collected.toArray(new Arc[collected.size()]);
}
/**
 * Unsupported operation: entries cannot be added one by one.
 *
 * @return Always <code>false</code>.
 */
@Override
public boolean add(String key, Object value) {
  // The automaton builder needs all of its input up front, in sorted order,
  // so ad-hoc additions are rejected.
  return false;
}
/**
 * Get the (approximated) weight of a single key (if there is a perfect match
 * for it in the automaton).
 *
 * @param key The key to look up.
 * @return Returns the approximated weight of the input key or <code>null</code>
 * if not found. <code>null</code> is also returned when no automaton is
 * available (nothing has been built or loaded yet, or the input was empty).
 */
@Override
public Float get(String key) {
  // Without an automaton there are no root arcs to traverse; report
  // "not found" instead of failing with a NullPointerException.
  if (automaton == null || rootArcs == null) {
    return null;
  }
  return getExactMatchStartingFromRootArc(0, key);
}
/**
 * Scans the cached root arcs, beginning at index <code>i</code>, and returns
 * the weight of the first one under which <code>key</code> is an exact match.
 *
 * @param i The first root arc index in {@link #rootArcs} to consider when
 * matching.
 * @param key The key to match exactly.
 * @return The root arc's label divided by the number of buckets, or
 * <code>null</code> if no root arc from <code>i</code> onwards matches.
 */
private Float getExactMatchStartingFromRootArc(int i, String key) {
  try {
    // One scratch arc is reused for every root arc that is tried.
    final FST.Arc<Object> cursor = new FST.Arc<Object>();
    for (int index = i; index < rootArcs.length; index++) {
      final FST.Arc<Object> root = rootArcs[index];
      final FST.Arc<Object> arc = cursor.copyFrom(root);

      // Follow the key's characters below this root arc.
      if (!descendWithPrefix(arc, key)) {
        continue;
      }

      automaton.readFirstTargetArc(arc, arc);
      if (arc.label == FST.END_LABEL) {
        // The weight is encoded in the root arc's label.
        return root.label / (float) buckets;
      }
    }
    return null;
  } catch (IOException e) {
    // Should never happen, but anyway.
    throw new RuntimeException(e);
  }
}
/**
 * Lookup autocomplete suggestions to <code>key</code>.
 *
 * @param key The prefix to which suggestions should be sought.
 * @param onlyMorePopular Return most popular suggestions first. If <code>false</code>
 * and the automaton encodes more than one weight, suggestions are collected from
 * all weights and sorted alphabetically instead (use constant term weights to get
 * alphabetical order cheaply).
 * @param num At most this number of suggestions will be returned.
 * @return Returns the suggestions, sorted by their approximated weight first (decreasing)
 * and then alphabetically (utf16 codepoint order). An empty list is returned for
 * an empty key or when no automaton is available.
 */
@Override
public List<LookupResult> lookup(String key, boolean onlyMorePopular, int num) {
  if (key.length() == 0 || automaton == null) {
    // Hand out a fresh list: a shared mutable instance could be modified by one
    // caller and then leak bogus "results" to every other caller. It is still
    // an ArrayList, which keeps calls on the result monomorphic.
    return new ArrayList<LookupResult>(0);
  }

  try {
    if (!onlyMorePopular && rootArcs.length > 1) {
      // We could emit a warning here (?). An optimal strategy for alphabetically sorted
      // suggestions would be to add them with a constant weight -- this saves unnecessary
      // traversals and sorting.
      return lookupSortedAlphabetically(key, num);
    } else {
      return lookupSortedByWeight(key, num, true);
    }
  } catch (IOException e) {
    // Should never happen, but anyway.
    throw new RuntimeException(e);
  }
}
/**
 * Produces alphabetically ordered suggestions when the automaton holds terms
 * with <b>differing weights</b>. This is a fallback; constant weights give
 * alphabetical order directly and are the recommended setup.
 *
 * @param key The prefix to complete.
 * @param num Maximum number of suggestions to return.
 */
private List<LookupResult> lookupSortedAlphabetically(String key, int num) throws IOException {
  // Collect candidates from the weight branches without stopping early.
  final List<LookupResult> candidates = lookupSortedByWeight(key, num, false);

  final Comparator<LookupResult> byKey = new Comparator<LookupResult>() {
    @Override
    public int compare(LookupResult left, LookupResult right) {
      return left.key.compareTo(right.key);
    }
  };
  Collections.sort(candidates, byKey);

  // Trim to the requested size.
  return candidates.size() > num ? candidates.subList(0, num) : candidates;
}
/**
 * Lookup suggestions sorted by weight (descending order).
 *
 * @param key The prefix to complete; must not be empty.
 * @param num Maximum number of suggestions requested. For values less than or
 * equal to zero an empty list is returned.
 * @param greedy If <code>true</code>, the routine terminates immediately when <code>num</code>
 * suggestions have been collected. If <code>false</code>, it will collect up to
 * <code>num</code> suggestions from each weight arc (needed for
 * {@link #lookupSortedAlphabetically}).
 * @return The collected suggestions. In greedy mode with exact-match-first enabled,
 * an exact match (if any) is moved or inserted at index 0 without being duplicated.
 */
private ArrayList<LookupResult> lookupSortedByWeight(String key, int num, boolean greedy) throws IOException {
  if (num <= 0) {
    // Nothing was requested; avoids a negative list capacity and the spurious
    // single result the collector would otherwise add before checking the limit.
    return new ArrayList<LookupResult>(0);
  }

  final ArrayList<LookupResult> res = new ArrayList<LookupResult>(Math.min(10, num));
  final StringBuilder output = new StringBuilder(key);
  final int matchLength = key.length() - 1;

  for (int i = 0; i < rootArcs.length; i++) {
    final FST.Arc<Object> rootArc = rootArcs[i];
    final FST.Arc<Object> arc = new FST.Arc<Object>().copyFrom(rootArc);

    // Descend into the automaton using the key as prefix.
    if (!descendWithPrefix(arc, key)) {
      continue;
    }

    // Prefix-encoded weight.
    final float weight = rootArc.label / (float) buckets;

    // The collector stops once the result list reaches the limit passed in.
    // In greedy mode that limit is the overall number of suggestions. Otherwise
    // every weight branch must be allowed to contribute up to num entries of
    // its own, or branches visited after the list first fills up would add a
    // single entry each and the alphabetical top-num could be wrong.
    final int limit = greedy
        ? num
        : (int) Math.min((long) res.size() + num, Integer.MAX_VALUE);

    // A subgraph starting from the current node has the completions
    // of the key prefix. The arc we're at is the last key's character,
    // so we will collect it too.
    output.setLength(matchLength);
    if (collect(res, limit, weight, output, arc) && greedy) {
      // We have enough suggestions to return immediately. Make sure the
      // exact match comes first, if requested.
      if (exactMatchFirst) {
        promoteExactMatch(res, key, num, i);
      }
      break;
    }
  }
  return res;
}

/**
 * Ensures that an exact match for <code>key</code>, if one exists, is the first
 * element of <code>res</code>. A match already present in the list is moved to
 * the front (keeping its weight); otherwise the remaining root arcs, starting
 * at <code>firstRootArc</code>, are searched and a match found there is inserted
 * at the front, trimming the list back to <code>num</code> elements.
 */
private void promoteExactMatch(ArrayList<LookupResult> res, String key, int num, int firstRootArc) {
  for (int k = 0; k < res.size(); k++) {
    if (key.equals(res.get(k).key)) {
      if (k > 0) {
        res.add(0, res.remove(k));
      }
      return;
    }
  }

  Float exactMatchWeight = getExactMatchStartingFromRootArc(firstRootArc, key);
  if (exactMatchWeight != null) {
    res.add(0, new LookupResult(key, exactMatchWeight));
    while (res.size() > num) {
      res.remove(res.size() - 1);
    }
  }
}
/**
 * Follows, one character at a time, the path spelled by <code>term</code>,
 * starting at <code>arc</code>.
 *
 * @param arc The starting arc. This argument is modified in-place.
 * @param term The term to descend with.
 * @return <code>true</code> if a target arc was found for every character of
 * <code>term</code>; <code>arc</code> is then the arc found for the last
 * character. <code>false</code> as soon as one character has no matching arc.
 */
private boolean descendWithPrefix(Arc<Object> arc, String term) throws IOException {
  for (int pos = 0, length = term.length(); pos < length; pos++) {
    final int label = term.charAt(pos) & 0xffff;
    if (automaton.findTargetArc(label, arc, arc) == null) {
      // The automaton has no path with this prefix.
      return false;
    }
  }
  return true;
}
/**
 * Recursively collects lookup results from the automaton subgraph starting at <code>arc</code>.
 * Traversal is depth-first and stops as soon as <code>res</code> holds at least
 * <code>num</code> elements.
 *
 * @param res The list results are appended to.
 * @param num Maximum number of results needed (early termination). The check is made
 * against the total size of <code>res</code>, including anything it held on entry.
 * @param weight Weight of all results found during this collection.
 * @param output Characters of the path walked so far; the label of <code>arc</code> is
 * appended to it. The content is not restored when <code>true</code> is returned.
 * @param arc The arc to continue from. This argument is modified in-place.
 * @return <code>true</code> if the size of <code>res</code> reached <code>num</code>
 * (the caller should stop), <code>false</code> if the subgraph was exhausted first.
 */
private boolean collect(List<LookupResult> res, int num, float weight, StringBuilder output, Arc<Object> arc) throws IOException {
// The label of the arc we are standing on is the next character of the suggestion.
output.append((char) arc.label);
// Reuse the same arc object for iterating over the arcs below it
// (presumably the arcs leaving this arc's target node, going by the method name).
automaton.readFirstTargetArc(arc, arc);
while (true) {
if (arc.label == FST.END_LABEL) {
// End marker: the characters gathered in output form a complete term.
res.add(new LookupResult(output.toString(), weight));
if (res.size() >= num)
return true;
} else {
// Remember the current length so the characters appended by the recursive
// call can be dropped again before moving on to the sibling arc.
int save = output.length();
// Recurse on a copy: the callee modifies its arc argument, and this loop
// still needs arc to advance to the next sibling.
if (collect(res, num, weight, output, new Arc<Object>().copyFrom(arc))) {
return true;
}
output.setLength(save);
}
if (arc.isLast()) {
break;
}
automaton.readNextArc(arc);
}
return false;
}
/**
 * Sorts the given entries, drops duplicates and builds the automaton from them.
 * Note that <code>entries</code> is reordered and partially overwritten in place.
 *
 * @param entries Terms with the weight already encoded in the first character.
 * @return The automaton, or <code>null</code> if there are no entries.
 */
private FST<Object> buildAutomaton(List<Entry> entries) throws IOException {
  if (entries.size() == 0) {
    return null;
  }

  // Order by raw utf16 char values; on a shared prefix the shorter term goes first.
  final Comparator<Entry> rawCharOrder = new Comparator<Entry>() {
    public int compare(Entry left, Entry right) {
      final char[] a = left.term;
      final char[] b = right.term;
      final int aLength = a.length;
      final int bLength = b.length;
      final int shared = Math.min(aLength, bLength);
      for (int k = 0; k < shared; k++) {
        final int difference = a[k] - b[k];
        if (difference != 0) {
          return difference;
        }
      }
      return aLength - bLength;
    }
  };
  Collections.sort(entries, rawCharOrder);

  // Compact the sorted list so that identical entries appear only once;
  // duplicates would break automaton construction.
  final int total = entries.size();
  int lastUnique = 0;
  for (int next = 1; next < total; next++) {
    final Entry candidate = entries.get(next);
    if (rawCharOrder.compare(entries.get(lastUnique), candidate) != 0) {
      lastUnique++;
      entries.set(lastUnique, candidate);
    }
  }
  final List<Entry> unique = entries.subList(0, lastUnique + 1);

  // Feed the unique, sorted terms to the builder; no outputs are stored.
  final Outputs<Object> outputs = NoOutputs.getSingleton();
  final Object noOutput = outputs.getNoOutput();
  final Builder<Object> builder =
      new Builder<Object>(FST.INPUT_TYPE.BYTE4, 0, 0, true, outputs);
  final IntsRef scratch = new IntsRef(10);
  for (Entry entry : unique) {
    final char[] chars = entry.term;
    final int termLength = chars.length;
    scratch.length = termLength;
    scratch.grow(termLength);
    final int[] ints = scratch.ints;
    for (int k = 0; k < termLength; k++) {
      ints[k] = chars[k];
    }
    builder.add(scratch, noOutput);
  }
  return builder.finish();
}
/**
 * Writes each entry's (already discretized) weight into the first character
 * of its term, so that the root automaton node fans out to all used
 * priorities.
 */
private void encodeWeightPrefix(List<Entry> entries) {
  for (Entry entry : entries) {
    final int bucket = (int) entry.weight;
    assert (bucket >= 0 && bucket <= buckets) :
        "Weight out of range: " + bucket + " [" + buckets + "]";

    // Index 0 of the term is the slot kept free for the weight.
    entry.term[0] = (char) bucket;
  }
}
/**
 * Replaces every entry's weight with the number of the bucket it falls into
 * when the [min, max] range of all weights is split proportionally. The
 * resulting values lie in the [0, buckets] range (so there are in fact
 * buckets + 1 distinct values).
 *
 * @param entries Entries to remap; must not be empty.
 * @param buckets Number of buckets to split the range into.
 */
private void redistributeWeightsProportionalMinMax(List<Entry> entries, int buckets) {
  float lowest = entries.get(0).weight;
  float highest = lowest;
  for (Entry entry : entries) {
    lowest = Math.min(entry.weight, lowest);
    highest = Math.max(entry.weight, highest);
  }

  final float span = highest - lowest;
  for (Entry entry : entries) {
    // When all weights are equal the span is 0 and the division yields NaN;
    // the narrowing cast to int turns NaN into 0, so every entry lands in bucket 0.
    final float fraction = (entry.weight - lowest) / span;
    entry.weight = (int) (buckets * fraction); // int cast truncates toward zero
  }
}
/**
 * Reads the automaton from the {@link #FILENAME} file inside
 * <code>storeDir</code> and re-creates the root arc cache.
 *
 * @return <code>false</code> if the file does not exist or cannot be read,
 * <code>true</code> once the automaton has been loaded.
 */
@Override
public synchronized boolean load(File storeDir) throws IOException {
  final File data = new File(storeDir, FILENAME);
  final boolean readable = data.exists() && data.canRead();
  if (!readable) {
    return false;
  }

  final InputStream is = new BufferedInputStream(new FileInputStream(data));
  try {
    this.automaton = new FST<Object>(new InputStreamDataInput(is), NoOutputs.getSingleton());
    cacheRootArcs();
    return true;
  } finally {
    Closeables.closeQuietly(is);
  }
}
/**
 * Serialization to disk: writes the automaton to the {@link #FILENAME} file
 * inside <code>storeDir</code>.
 *
 * @return <code>false</code> if <code>storeDir</code> is not an existing, writable
 * directory or if there is no automaton to store; <code>true</code> once the
 * automaton has been written and flushed.
 * @throws IOException If writing or flushing the data fails.
 */
@Override
public synchronized boolean store(File storeDir) throws IOException {
  if (!storeDir.exists() || !storeDir.isDirectory() || !storeDir.canWrite()) {
    return false;
  }

  if (this.automaton == null)
    return false;

  File data = new File(storeDir, FILENAME);
  OutputStream os = new BufferedOutputStream(new FileOutputStream(data));
  try {
    this.automaton.save(new OutputStreamDataOutput(os));
    // Flush explicitly while exceptions are still propagated: the quiet close
    // below swallows any failure to write out buffered bytes, which would
    // otherwise report success for a truncated file.
    os.flush();
  } finally {
    Closeables.closeQuietly(os);
  }

  return true;
}
}

View File

@ -0,0 +1,31 @@
package org.apache.solr.spelling.suggest.fst;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import org.apache.lucene.store.DataInput;
import com.google.common.io.ByteStreams;
/**
 * Adapts a plain {@link InputStream} to the {@link DataInput} API.
 */
public class InputStreamDataInput extends DataInput {
  /** The stream all reads are delegated to. */
  private final InputStream is;

  public InputStreamDataInput(InputStream is) {
    this.is = is;
  }

  /**
   * Reads a single byte.
   *
   * @throws EOFException If the underlying stream reports end of input.
   */
  @Override
  public byte readByte() throws IOException {
    final int value = is.read();
    if (value != -1) {
      return (byte) value;
    }
    throw new EOFException();
  }

  /** Fills <code>len</code> bytes of <code>b</code>, starting at <code>offset</code>. */
  @Override
  public void readBytes(byte[] b, int offset, int len) throws IOException {
    ByteStreams.readFully(is, b, offset, len);
  }
}

View File

@ -0,0 +1,28 @@
package org.apache.solr.spelling.suggest.fst;
import java.io.IOException;
import java.io.OutputStream;
import org.apache.lucene.store.DataOutput;
/**
 * Adapts a plain {@link OutputStream} to the {@link DataOutput} API.
 */
public class OutputStreamDataOutput extends DataOutput {
  /** The stream all writes are delegated to. */
  private final OutputStream os;

  public OutputStreamDataOutput(OutputStream os) {
    this.os = os;
  }

  /** Writes a single byte to the underlying stream. */
  @Override
  public void writeByte(byte b) throws IOException {
    os.write(b);
  }

  /** Writes <code>length</code> bytes of <code>b</code>, starting at <code>offset</code>. */
  @Override
  public void writeBytes(byte[] b, int offset, int length) throws IOException {
    os.write(b, offset, length);
  }
}

File diff suppressed because it is too large Load Diff

View File

@ -31,7 +31,7 @@
<requestHandler name="standard" class="solr.StandardRequestHandler" />
<!-- Suggest component -->
<searchComponent class="solr.SpellCheckComponent" name="suggest">
<searchComponent class="solr.SpellCheckComponent" name="suggest_jaspell">
<lst name="spellchecker">
<str name="name">suggest</str>
<str name="classname">org.apache.solr.spelling.suggest.Suggester</str>
@ -45,6 +45,38 @@
</lst>
</searchComponent>
<!-- TSTLookup suggest component -->
<searchComponent class="solr.SpellCheckComponent" name="suggest_tst">
<lst name="spellchecker">
<str name="name">suggest_tst</str>
<str name="classname">org.apache.solr.spelling.suggest.Suggester</str>
<str name="lookupImpl">org.apache.solr.spelling.suggest.tst.TSTLookup</str>
<str name="field">suggest</str>
<str name="storeDir">suggest_tst</str>
<str name="buildOnCommit">true</str>
<!-- Suggester properties -->
<float name="threshold">0.0</float>
</lst>
</searchComponent>
<!-- FSTLookup suggest component -->
<searchComponent class="solr.SpellCheckComponent" name="suggest_fst">
<lst name="spellchecker">
<str name="name">suggest_fst</str>
<str name="classname">org.apache.solr.spelling.suggest.Suggester</str>
<str name="lookupImpl">org.apache.solr.spelling.suggest.fst.FSTLookup</str>
<str name="field">suggest</str>
<str name="storeDir">suggest_fst</str>
<str name="buildOnCommit">true</str>
<!-- Suggester properties -->
<int name="weightBuckets">5</int>
<bool name="exactMatchFirst">true</bool>
</lst>
</searchComponent>
<!-- The default (jaspell) -->
<requestHandler class="org.apache.solr.handler.component.SearchHandler" name="/suggest">
<lst name="defaults">
<str name="spellcheck">true</str>
@ -52,7 +84,31 @@
<str name="spellcheck.collate">true</str>
</lst>
<arr name="components">
<str>suggest</str>
<str>suggest_jaspell</str>
</arr>
</requestHandler>
<!-- tst (ternary tree based) -->
<requestHandler class="org.apache.solr.handler.component.SearchHandler" name="/suggest_tst">
<lst name="defaults">
<str name="spellcheck">true</str>
<str name="spellcheck.dictionary">suggest_tst</str>
<str name="spellcheck.collate">true</str>
</lst>
<arr name="components">
<str>suggest_tst</str>
</arr>
</requestHandler>
<!-- fst (finite state automaton based) -->
<requestHandler class="org.apache.solr.handler.component.SearchHandler" name="/suggest_fst">
<lst name="defaults">
<str name="spellcheck">true</str>
<str name="spellcheck.dictionary">suggest_fst</str>
<str name="spellcheck.collate">false</str>
</lst>
<arr name="components">
<str>suggest_fst</str>
</arr>
</requestHandler>

View File

@ -0,0 +1,52 @@
package org.apache.solr.spelling.suggest;
import java.util.List;
import java.util.Locale;
/**
* Average with standard deviation.
*/
final class Average
{
/**
* Average (in milliseconds).
*/
public final double avg;
/**
* Standard deviation (in milliseconds).
*/
public final double stddev;
/**
*
*/
Average(double avg, double stddev)
{
this.avg = avg;
this.stddev = stddev;
}
public String toString()
{
return String.format(Locale.ENGLISH, "%.0f [+- %.2f]",
avg, stddev);
}
static Average from(List<Double> values)
{
double sum = 0;
double sumSquares = 0;
for (double l : values)
{
sum += l;
sumSquares += l * l;
}
double avg = sum / (double) values.size();
return new Average(
(sum / (double) values.size()),
Math.sqrt(sumSquares / (double) values.size() - avg * avg));
}
}

View File

@ -0,0 +1,230 @@
package org.apache.solr.spelling.suggest;
import java.net.URL;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Random;
import java.util.concurrent.Callable;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.solr.spelling.suggest.fst.FSTLookup;
import org.apache.solr.spelling.suggest.jaspell.JaspellLookup;
import org.apache.solr.spelling.suggest.tst.TSTLookup;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;
import com.google.common.base.Charsets;
import com.google.common.base.Function;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.io.Resources;
/**
* Benchmarks tests for implementations of {@link Lookup} interface.
*/
@Ignore // COMMENT ME TO RUN BENCHMARKS!
public class LookupBenchmarkTest {
@SuppressWarnings("unchecked")
private final List<Class<? extends Lookup>> benchmarkClasses = Lists.newArrayList(
JaspellLookup.class,
TSTLookup.class,
FSTLookup.class);
private final static int rounds = 15;
private final static int warmup = 5;
private final int num = 7;
private final boolean onlyMorePopular = true;
private final static Random random = new Random(0xdeadbeef);
/**
* Input term/weight pairs.
*/
private static TermFreq [] dictionaryInput;
/**
* Benchmark term/weight pairs (randomized order).
*/
private static List<TermFreq> benchmarkInput;
/**
 * Reads the term/weight pairs once for all tests and keeps them in two
 * independently shuffled orders: one for building the lookups and one for
 * querying them.
 */
@BeforeClass
public static void setup() throws Exception {
  final List<TermFreq> terms = readTop50KWiki();

  // Snapshot of the first shuffle: the dictionary input.
  Collections.shuffle(terms, random);
  dictionaryInput = terms.toArray(new TermFreq[terms.size()]);

  // The list itself, shuffled once more: the benchmark input.
  Collections.shuffle(terms, random);
  benchmarkInput = terms;
}
/**
 * Collect the multilingual input for benchmarks/ tests. Every line of the
 * <code>Top50KWiki.utf8</code> classpath resource is expected to have the form
 * <code>term|weight</code>.
 *
 * @return The parsed term/weight pairs, in file order.
 */
public static List<TermFreq> readTop50KWiki() throws Exception {
  List<TermFreq> input = Lists.newArrayList();
  URL resource = Thread.currentThread().getContextClassLoader().getResource("Top50KWiki.utf8");
  // A JUnit assertion (unlike the assert keyword) is checked even when the JVM
  // runs without -ea, so a missing resource is reported clearly.
  Assert.assertNotNull("Resource missing: Top50KWiki.utf8", resource);

  for (String line : Resources.readLines(resource, Charsets.UTF_8)) {
    int separator = line.indexOf('|');
    Assert.assertTrue("No | separator?: " + line, separator >= 0);
    float weight = Float.parseFloat(line.substring(separator + 1));
    String key = line.substring(0, separator);
    input.add(new TermFreq(key, weight));
  }
  return input;
}
/**
 * Measures how long each implementation takes to build a lookup from the
 * dictionary input.
 */
@Test
public void testConstructionTime() throws Exception {
  System.err.println("-- construction time");
  for (final Class<? extends Lookup> cls : benchmarkClasses) {
    final Callable<Integer> construction = new Callable<Integer>() {
      public Integer call() throws Exception {
        // The hash code is returned only so that the result is used.
        return buildLookup(cls, dictionaryInput).hashCode();
      }
    };
    final BenchmarkResult result = measure(construction);

    final String report = String.format(Locale.ENGLISH, "%-15s input: %d, time[ms]: %s",
        cls.getSimpleName(),
        dictionaryInput.length,
        result.average.toString());
    System.err.println(report);
  }
}
/**
 * Reports the estimated RAM usage of each implementation after it has been
 * built from the dictionary input.
 */
@Test
public void testStorageNeeds() throws Exception {
  System.err.println("-- RAM consumption");
  final RamUsageEstimator estimator = new RamUsageEstimator();
  for (Class<? extends Lookup> cls : benchmarkClasses) {
    final Lookup lookup = buildLookup(cls, dictionaryInput);
    final String report = String.format(Locale.ENGLISH, "%-15s size[B]:%,13d",
        lookup.getClass().getSimpleName(),
        estimator.estimateRamUsage(lookup));
    System.err.println(report);
  }
}
/**
 * Instantiates the given {@link Lookup} class through its no-argument
 * constructor and builds it from <code>input</code>.
 */
private Lookup buildLookup(Class<? extends Lookup> cls, TermFreq[] input) throws Exception {
  final Lookup instance = cls.newInstance();
  final TermFreqArrayIterator terms = new TermFreqArrayIterator(input);
  instance.build(terms);
  return instance;
}
/**
 * Benchmarks lookups with prefix lengths of 100-200 characters; prefixes are
 * capped at the term's length, so terms shorter than that are queried in full.
 */
@Test
public void testPerformanceOnFullHits() throws Exception {
  runPerformanceTest(100, 200, num, onlyMorePopular);
}
/**
 * Benchmarks lookups on longer term prefixes (6-9 characters, or the whole
 * term if it is shorter).
 */
@Test
public void testPerformanceOnPrefixes6_9() throws Exception {
  runPerformanceTest(6, 9, num, onlyMorePopular);
}
/**
 * Benchmarks lookups on short term prefixes (2-4 characters, or the whole
 * term if it is shorter).
 */
@Test
public void testPerformanceOnPrefixes2_4() throws Exception {
  runPerformanceTest(2, 4, num, onlyMorePopular);
}
/**
 * Run the actual benchmark: every implementation is built from the dictionary
 * input and then queried with the same list of prefixes.
 *
 * @param minPrefixLen Minimum length of a query prefix.
 * @param maxPrefixLen Maximum length of a query prefix (inclusive). Prefixes never
 * exceed the length of the term they are cut from.
 * @param num Number of suggestions requested per query.
 * @param onlyMorePopular Passed through to {@link Lookup#lookup}.
 */
public void runPerformanceTest(final int minPrefixLen, final int maxPrefixLen,
    final int num, final boolean onlyMorePopular) throws Exception {
  System.err.println(String.format(Locale.ENGLISH,
      "-- prefixes: %d-%d, num: %d, onlyMorePopular: %s",
      minPrefixLen, maxPrefixLen, num, onlyMorePopular));

  // Cut the prefixes once, before the loop. They depend on the shared random
  // generator, so generating them per implementation would benchmark every
  // implementation on a different set of queries.
  final List<String> input = Lists.newArrayList(Iterables.transform(benchmarkInput, new Function<TermFreq, String>() {
    public String apply(TermFreq tf) {
      return tf.term.substring(0, Math.min(tf.term.length(),
          minPrefixLen + random.nextInt(maxPrefixLen - minPrefixLen + 1)));
    }
  }));

  for (Class<? extends Lookup> cls : benchmarkClasses) {
    final Lookup lookup = buildLookup(cls, dictionaryInput);

    BenchmarkResult result = measure(new Callable<Integer>() {
      public Integer call() throws Exception {
        int v = 0;
        for (String term : input) {
          v += lookup.lookup(term, onlyMorePopular, num).size();
        }
        return v;
      }
    });

    // The average is the time of one full pass in milliseconds, so scale by
    // 1000 to really get queries per second.
    final double queriesPerSecond = input.size() * 1000.0 / result.average.avg;

    System.err.println(
        String.format(Locale.ENGLISH, "%-15s queries: %d, time[ms]: %s, ~qps: %.0f",
            lookup.getClass().getSimpleName(),
            input.size(),
            result.average.toString(),
            queriesPerSecond));
  }
}
/**
 * Runs <code>callable</code> <code>warmup + rounds</code> times and records the
 * wall-clock time of every run in milliseconds.
 *
 * @return The result computed from the recorded times.
 * @throws RuntimeException Wrapping any exception thrown by <code>callable</code>.
 */
private BenchmarkResult measure(Callable<Integer> callable) {
  final double NANOS_PER_MS = 1000000;
  final int totalRuns = warmup + rounds;

  try {
    final List<Double> elapsedMs = Lists.newArrayList();
    int run = 0;
    while (run < totalRuns) {
      final long begin = System.nanoTime();
      // Publish the result to a volatile field so that the call is not optimized away.
      guard = callable.call().intValue();
      final long end = System.nanoTime();
      elapsedMs.add((end - begin) / NANOS_PER_MS);
      run++;
    }
    return new BenchmarkResult(elapsedMs, warmup, rounds);
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
/** Guard against opts. */
@SuppressWarnings("unused")
private static volatile int guard;
private static class BenchmarkResult {
  /** Average time per round (ms). */
  public final Average average;

  /**
   * @param times Times of all runs, warmup runs first.
   * @param warmup Number of leading entries of <code>times</code> to ignore.
   * @param rounds Not used by the computation.
   */
  public BenchmarkResult(List<Double> times, int warmup, int rounds) {
    final List<Double> measured = times.subList(warmup, times.size());
    this.average = Average.from(measured);
  }
}
}

View File

@ -19,13 +19,13 @@ package org.apache.solr.spelling.suggest;
import java.io.File;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.spelling.suggest.fst.FSTLookup;
import org.apache.solr.spelling.suggest.jaspell.JaspellLookup;
import org.apache.solr.spelling.suggest.tst.TSTLookup;
import org.junit.Test;
public class PersistenceTest extends SolrTestCaseJ4 {
public static final String[] keys = new String[] {
public final String[] keys = new String[] {
"one",
"two",
"three",
@ -40,41 +40,53 @@ public class PersistenceTest extends SolrTestCaseJ4 {
"threat",
"foundation",
"fourier",
"fourty"
};
"fourty"};
@Test
public void testTSTPersistence() throws Exception {
TSTLookup lookup = new TSTLookup();
for (String k : keys) {
lookup.add(k, new Float(k.length()));
}
File storeDir = new File(TEST_HOME());
lookup.store(storeDir);
lookup = new TSTLookup();
lookup.load(storeDir);
for (String k : keys) {
Float val = (Float)lookup.get(k);
assertNotNull(k, val);
assertEquals(k, k.length(), val.intValue());
}
runTest(TSTLookup.class, true);
}
/**
 * Stores a populated {@link JaspellLookup}, reloads it into a fresh instance
 * and checks that every key is still present with its original value.
 */
@Test
public void testJaspellPersistence() throws Exception {
JaspellLookup lookup = new JaspellLookup();
for (String k : keys) {
// Each key's weight is its own length, so the expected value can be recomputed below.
lookup.add(k, new Float(k.length()));
}
File storeDir = new File(TEST_HOME());
lookup.store(storeDir);
// Load into a brand-new instance so only the persisted state is verified.
lookup = new JaspellLookup();
lookup.load(storeDir);
for (String k : keys) {
Float val = (Float)lookup.get(k);
assertNotNull(k, val);
assertEquals(k, k.length(), val.intValue());
}
// Same store/load round trip through the shared helper, requiring exact weights.
runTest(JaspellLookup.class, true);
}
/**
 * Runs the shared store/load round trip for {@link FSTLookup}. Passing
 * {@code false} makes {@code runTest} check only that the reloaded weights
 * never decrease in key order, instead of comparing them exactly.
 */
@Test
public void testFSTPersistence() throws Exception {
runTest(FSTLookup.class, false);
}
/**
 * Builds a lookup of the given class from the test keys, persists it, loads it
 * into a second instance and validates the values that come back.
 *
 * @param lookupClass the {@link Lookup} implementation to instantiate reflectively
 * @param supportsExactWeights if {@code true}, every reloaded value must equal the
 *        weight it was built with; otherwise the values only have to be
 *        non-decreasing in key order
 */
private void runTest(Class<? extends Lookup> lookupClass,
    boolean supportsExactWeights) throws Exception {
  // Build the suggester; each key is weighted by its position in the key array.
  Lookup original = lookupClass.newInstance();
  final TermFreq[] entries = new TermFreq[this.keys.length];
  int index = 0;
  while (index < entries.length) {
    entries[index] = new TermFreq(this.keys[index], (float) index);
    index++;
  }
  original.build(new TermFreqArrayIterator(entries));

  // Persist it.
  final File storeDir = new File(TEST_HOME());
  original.store(storeDir);

  // Read it back into an untouched instance.
  final Lookup reloaded = lookupClass.newInstance();
  reloaded.load(storeDir);

  // Check what was read back.
  float previous = Float.NEGATIVE_INFINITY;
  for (int n = 0; n < entries.length; n++) {
    final TermFreq entry = entries[n];
    final Float stored = (Float) reloaded.get(entry.term);
    assertNotNull(entry.term, stored);
    if (!supportsExactWeights) {
      assertTrue(stored + ">=" + previous, stored >= previous);
      previous = stored.floatValue();
    } else {
      assertEquals(entry.term, Float.valueOf(entry.v), stored);
    }
  }
}
}

View File

@ -0,0 +1,7 @@
package org.apache.solr.spelling.suggest;
/**
 * Re-runs the inherited {@link SuggesterTest} cases with requests sent to
 * {@code /suggest_fst} instead of the default request URI.
 */
public class SuggesterFSTTest extends SuggesterTest {
  public SuggesterFSTTest() {
    this.requestUri = "/suggest_fst";
  }
}

View File

@ -0,0 +1,7 @@
package org.apache.solr.spelling.suggest;
/**
 * Re-runs the inherited {@link SuggesterTest} cases with requests sent to
 * {@code /suggest_tst} instead of the default request URI.
 */
public class SuggesterTSTTest extends SuggesterTest {
  public SuggesterTSTTest() {
    this.requestUri = "/suggest_tst";
  }
}

View File

@ -17,28 +17,19 @@
package org.apache.solr.spelling.suggest;
import org.apache.lucene.util.RamUsageEstimator;
import java.io.File;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.params.SpellingParams;
import org.apache.solr.spelling.suggest.Lookup.LookupResult;
import org.apache.solr.spelling.suggest.jaspell.JaspellLookup;
import org.apache.solr.spelling.suggest.tst.TSTLookup;
import org.apache.solr.util.TermFreqIterator;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;
import com.google.common.collect.Lists;
import java.io.File;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Random;
public class SuggesterTest extends SolrTestCaseJ4 {
/**
* Expected URI at which the given suggester will live.
*/
protected String requestUri = "/suggest";
@BeforeClass
public static void beforeClass() throws Exception {
initCore("solrconfig-spellchecker.xml","schema-spellchecker.xml");
@ -59,10 +50,9 @@ public class SuggesterTest extends SolrTestCaseJ4 {
@Test
public void testSuggestions() throws Exception {
addDocs();
assertU(commit()); // configured to do a rebuild on commit
assertQ(req("qt","/suggest", "q","ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
assertQ(req("qt", requestUri, "q", "ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/int[@name='numFound'][.='2']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/arr[@name='suggestion']/str[1][.='acquire']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/arr[@name='suggestion']/str[2][.='accommodate']"
@ -82,7 +72,7 @@ public class SuggesterTest extends SolrTestCaseJ4 {
dataDir = data;
configString = config;
initCore();
assertQ(req("qt","/suggest", "q","ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
assertQ(req("qt", requestUri, "q", "ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/int[@name='numFound'][.='2']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/arr[@name='suggestion']/str[1][.='acquire']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/arr[@name='suggestion']/str[2][.='accommodate']"
@ -96,132 +86,13 @@ public class SuggesterTest extends SolrTestCaseJ4 {
public void testRebuild() throws Exception {
addDocs();
assertU(commit());
assertQ(req("qt","/suggest", "q","ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
assertQ(req("qt", requestUri, "q", "ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/int[@name='numFound'][.='2']");
assertU(adoc("id", "4",
"text", "actually"
));
assertU(commit());
assertQ(req("qt","/suggest", "q","ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
assertQ(req("qt", requestUri, "q", "ac", SpellingParams.SPELLCHECK_COUNT, "2", SpellingParams.SPELLCHECK_ONLY_MORE_POPULAR, "true"),
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ac']/int[@name='numFound'][.='2']");
}
/**
 * Creates an iterator over 100000 pseudo-random keys (decimal renderings of
 * random longs) with frequencies in the range 0..3. Both generators use fixed
 * seeds, so every iterator returned by this method yields the same sequence.
 */
private TermFreqIterator getTFIT() {
  final int count = 100000;
  return new TermFreqIterator() {
    Random keySource = new Random(1234567890L);
    Random freqSource = new Random(1234567890L);
    int emitted;

    public float freq() {
      return freqSource.nextInt(4);
    }

    public boolean hasNext() {
      return emitted < count;
    }

    public String next() {
      emitted++;
      final long raw = keySource.nextLong();
      return Long.toString(raw);
    }

    public void remove() {
      throw new UnsupportedOperationException();
    }
  };
}
/** Timing totals accumulated across benchmark passes. */
static class Bench {
/** Sum of build times in milliseconds; singleBenchmark adds System.currentTimeMillis() differences. */
long buildTime;
/** Sum of lookup times in nanoseconds; singleBenchmark adds System.nanoTime() differences. */
long lookupTime;
}
/**
 * Manual benchmark (ignored by default) of the Jaspell and TST lookups. It
 * first runs one pass per implementation and prints an estimated size, then
 * runs the warmed-up benchmark and prints the average build and lookup times.
 */
@Test @Ignore
public void testBenchmark() throws Exception {
  final List<Class<? extends Lookup>> benchmarkClasses = Lists.newArrayList();
  benchmarkClasses.add(JaspellLookup.class);
  benchmarkClasses.add(TSTLookup.class);

  // One plain pass per implementation: smoke test plus a size estimate.
  final RamUsageEstimator rue = new RamUsageEstimator();
  for (Class<? extends Lookup> candidate : benchmarkClasses) {
    final Lookup built = singleBenchmark(candidate, null);
    final String sizeLine = String.format(Locale.ENGLISH,
        "%20s, size[B]=%,d",
        built.getClass().getSimpleName(),
        rue.estimateRamUsage(built));
    System.err.println(sizeLine);
  }

  final int warmupCount = 10;
  final int measuredCount = 100;

  for (Class<? extends Lookup> candidate : benchmarkClasses) {
    final Bench totals = fullBenchmark(candidate, warmupCount, measuredCount);
    // Build time is accumulated in ms, lookup time in ns (hence the extra division).
    final long avgBuildMillis = totals.buildTime / measuredCount;
    final long avgLookupMillis = totals.lookupTime / measuredCount / 1000000;
    final String timingLine = String.format(Locale.ENGLISH,
        "%s: buildTime[ms]=%,d lookupTime[ms]=%,d",
        candidate.getSimpleName(),
        avgBuildMillis,
        avgLookupMillis);
    System.err.println(timingLine);
  }
}
/**
 * Performs one benchmark pass: builds a fresh lookup of the given class from
 * the generated keys, then queries it once per key with a prefix of that key,
 * asserting that every query returns at least one result and that all results
 * start with the prefix.
 *
 * @param cls the {@link Lookup} implementation to instantiate
 * @param bench accumulator that receives the build time (ms) and the summed
 *        lookup time (ns) of this pass; may be {@code null} to discard timings
 * @return the lookup that was built
 */
private Lookup singleBenchmark(Class<? extends Lookup> cls, Bench bench) throws Exception {
  final Lookup lookup = cls.newInstance();

  // Creating the key iterator is part of the timed build, as in a real build call.
  final long buildStart = System.currentTimeMillis();
  lookup.build(getTFIT());
  final long buildTime = System.currentTimeMillis() - buildStart;

  long elapsed = 0;
  for (TermFreqIterator keys = getTFIT(); keys.hasNext();) {
    final String key = keys.next();
    // take only the first part of the key
    final int len;
    if (key.length() > 4) {
      len = key.length() / 3;
    } else {
      len = 2;
    }
    final String prefix = key.substring(0, len);

    // Only the lookup call itself is timed; the checks below are not.
    final long lookupStart = System.nanoTime();
    final List<LookupResult> res = lookup.lookup(prefix, true, 10);
    elapsed += System.nanoTime() - lookupStart;

    assertTrue(res.size() > 0);
    for (LookupResult hit : res) {
      assertTrue(hit.key.startsWith(prefix));
    }
  }

  if (bench != null) {
    bench.buildTime += buildTime;
    bench.lookupTime += elapsed;
  }
  return lookup;
}
/**
 * Runs {@code warmupCount} unrecorded passes followed by {@code measuredCount}
 * recorded passes of {@code singleBenchmark} for the given class, requesting
 * finalization and garbage collection before each pass and printing progress
 * to {@code System.err}.
 *
 * @param cls the {@link Lookup} implementation to benchmark
 * @param warmupCount number of passes whose timings are discarded
 * @param measuredCount number of passes whose timings are accumulated
 * @return the timings accumulated over the measured passes
 */
private Bench fullBenchmark(Class<? extends Lookup> cls, int warmupCount, int measuredCount) throws Exception {
  System.err.println("* Running " + measuredCount + " iterations for " + cls.getSimpleName() + " ...");
  System.err.println(" - warm-up " + warmupCount + " iterations...");
  for (int remaining = warmupCount; remaining > 0; remaining--) {
    System.runFinalization();
    System.gc();
    singleBenchmark(cls, null);
  }

  final Bench totals = new Bench();
  System.err.print(" - main iterations:");
  System.err.flush();
  int pass = 0;
  while (pass < measuredCount) {
    System.runFinalization();
    System.gc();
    singleBenchmark(cls, totals);
    // Progress marker after every tenth pass, skipping the very first one.
    final boolean tenthPass = pass % 10 == 0;
    if (pass != 0 && tenthPass) {
      System.err.print(" " + pass);
      System.err.flush();
    }
    pass++;
  }
  System.err.println();
  return totals;
}
}

View File

@ -0,0 +1,11 @@
package org.apache.solr.spelling.suggest;
/**
 * An immutable pairing of a term with a floating-point value.
 */
public final class TermFreq {
  /** The term text. */
  public final String term;

  /** The value attached to the term. */
  public final float v;

  /**
   * @param term the term text, stored as given
   * @param v the value attached to the term
   */
  public TermFreq(String term, float v) {
    this.v = v;
    this.term = term;
  }
}

View File

@ -0,0 +1,40 @@
package org.apache.solr.spelling.suggest;
import java.util.Arrays;
import java.util.Iterator;
import org.apache.solr.util.TermFreqIterator;
/**
 * A {@link TermFreqIterator} over a sequence of {@link TermFreq}s.
 *
 * <p>{@link #freq()} reports the value of the element most recently returned
 * by {@link #next()}, so it must only be called after {@code next()}.
 */
public final class TermFreqArrayIterator implements TermFreqIterator {
  /** Underlying source of elements. */
  private final Iterator<TermFreq> source;

  /** Element last handed out by {@link #next()}; {@code null} before the first call. */
  private TermFreq current;

  public TermFreqArrayIterator(Iterator<TermFreq> i) {
    this.source = i;
  }

  public TermFreqArrayIterator(TermFreq [] i) {
    this(Arrays.asList(i));
  }

  public TermFreqArrayIterator(Iterable<TermFreq> i) {
    this(i.iterator());
  }

  public float freq() {
    return current.v;
  }

  public boolean hasNext() {
    return source.hasNext();
  }

  public String next() {
    current = source.next();
    return current.term;
  }

  public void remove() {
    throw new UnsupportedOperationException();
  }
}

View File

@ -0,0 +1,155 @@
package org.apache.solr.spelling.suggest.fst;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.Random;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.solr.spelling.suggest.Lookup.LookupResult;
import org.apache.solr.spelling.suggest.LookupBenchmarkTest;
import org.apache.solr.spelling.suggest.TermFreq;
import org.apache.solr.spelling.suggest.TermFreqArrayIterator;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import com.google.common.collect.Lists;
/**
 * Unit tests for {@link FSTLookup}.
 */
public class FSTLookupTest extends LuceneTestCase {
  /** Shorthand factory for a {@link TermFreq}. */
  public static TermFreq tf(String t, float v) {
    return new TermFreq(t, v);
  }

  /** Lookup under test; rebuilt from a small fixed key set before every test. */
  private FSTLookup lookup;

  /** Builds {@link #lookup} from a fixed set of keys; only "one" has a weight below 1. */
  @Before
  public void prepare() throws Exception {
    final TermFreq[] keys = new TermFreq[] {
        tf("one", 0.5f),
        tf("oneness", 1),
        tf("onerous", 1),
        tf("onesimus", 1),
        tf("two", 1),
        tf("twofold", 1),
        tf("twonk", 1),
        tf("thrive", 1),
        tf("through", 1),
        tf("threat", 1),
        tf("three", 1),
        tf("foundation", 1),
        tf("fourier", 1),
        tf("four", 1),
        tf("fourty", 1),
        tf("xo", 1),
    };

    lookup = new FSTLookup();
    lookup.build(new TermFreqArrayIterator(keys));
  }

  /** An exact match is the single result when only one suggestion is requested. */
  @Test
  public void testExactMatchHighPriority() throws Exception {
    assertMatchEquals(lookup.lookup("two", true, 1), "two/1.0");
  }

  /**
   * With two suggestions requested, the exact match "one" comes first even though
   * it was built with the lowest weight. Note that it is reported as 0.0 rather
   * than the 0.5 it was built with (presumably weights are discretized; confirm
   * against FSTLookup).
   */
  @Test
  public void testExactMatchLowPriority() throws Exception {
    assertMatchEquals(lookup.lookup("one", true, 2),
        "one/0.0",
        "oneness/1.0");
  }

  /** A prefix that matches no key yields no suggestions. */
  @Test
  public void testMiss() throws Exception {
    assertMatchEquals(lookup.lookup("xyz", true, 1));
  }

  /** A non-matching prefix also yields nothing when the boolean lookup flag is false. */
  @Test
  public void testAlphabeticWithWeights() throws Exception {
    assertEquals(0, lookup.lookup("xyz", false, 1).size());
  }

  /** With no effective limit, all keys under the prefix are returned, "one" last. */
  @Test
  public void testFullMatchList() throws Exception {
    assertMatchEquals(lookup.lookup("one", true, Integer.MAX_VALUE),
        "oneness/1.0",
        "onerous/1.0",
        "onesimus/1.0",
        "one/0.0");
  }

  /** Every term of the wiki input must be retrievable and be its own top suggestion. */
  @Test
  public void testMultilingualInput() throws Exception {
    List<TermFreq> input = LookupBenchmarkTest.readTop50KWiki();

    lookup = new FSTLookup();
    lookup.build(new TermFreqArrayIterator(input));

    for (TermFreq tf : input) {
      assertTrue("Not found: " + tf.term, lookup.get(tf.term) != null);
      assertEquals(tf.term, lookup.lookup(tf.term, true, 1).get(0).key);
    }
  }

  /** A lookup built from no input returns no suggestions. */
  @Test
  public void testEmptyInput() throws Exception {
    lookup = new FSTLookup();
    lookup.build(new TermFreqArrayIterator(new TermFreq[0]));

    assertMatchEquals(lookup.lookup("", true, 10));
  }

  /** Every suggestion for every proper prefix of 5000 random terms must start with that prefix. */
  @Test
  public void testRandom() throws Exception {
    List<TermFreq> freqs = Lists.newArrayList();
    Random rnd = random;
    for (int i = 0; i < 5000; i++) {
      freqs.add(new TermFreq("" + rnd.nextLong(), rnd.nextInt(100)));
    }
    lookup = new FSTLookup();
    lookup.build(new TermFreqArrayIterator(freqs.toArray(new TermFreq[freqs.size()])));

    for (TermFreq tf : freqs) {
      final String term = tf.term;
      for (int i = 1; i < term.length(); i++) {
        String prefix = term.substring(0, i);
        for (LookupResult lr : lookup.lookup(prefix, true, 10)) {
          Assert.assertTrue(lr.key.startsWith(prefix));
        }
      }
    }
  }

  /**
   * Fails the test unless the string forms of {@code res} equal {@code expected},
   * element by element and in order. On mismatch, a two-column table of expected
   * versus actual entries is printed to {@code System.err} and used as the
   * failure message.
   *
   * @param res the lookup results to check
   * @param expected the expected {@code toString()} renderings, in order
   */
  private void assertMatchEquals(List<LookupResult> res, String... expected) {
    String [] result = new String [res.size()];
    for (int i = 0; i < res.size(); i++)
      result[i] = res.get(i).toString();

    if (!Arrays.equals(expected, result)) {
      // A width of 0 would produce "%0s", which Formatter rejects with an
      // IllegalFormatException and would mask the real failure; keep it >= 1.
      int colLen = Math.max(1, Math.max(maxLen(expected), maxLen(result)));

      StringBuilder b = new StringBuilder();
      String format = "%" + colLen + "s " + "%" + colLen + "s\n";
      b.append(String.format(Locale.ENGLISH, format, "Expected", "Result"));
      for (int i = 0; i < Math.max(result.length, expected.length); i++) {
        b.append(String.format(Locale.ENGLISH, format,
            i < expected.length ? expected[i] : "--",
            i < result.length ? result[i] : "--"));
      }

      System.err.println(b.toString());
      fail("Expected different output:\n" + b.toString());
    }
  }

  /** Returns the length of the longest string in {@code result}, or 0 for an empty array. */
  private int maxLen(String[] result) {
    int len = 0;
    for (String s : result)
      len = Math.max(len, s.length());
    return len;
  }
}