SOLR-2888: FSTSuggester refactoring: internal storage is now UTF-8,

external sorting (on disk) prevents OOMs even with large data sets
(the bottleneck is now FST construction), code cleanups and API cleanups.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1209265 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Dawid Weiss 2011-12-01 21:49:27 +00:00
parent a7772b4958
commit 92de7e8848
19 changed files with 2066 additions and 630 deletions

View File

@ -17,12 +17,22 @@ package org.apache.lucene.util.fst;
* limitations under the License.
*/
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.CodecUtil;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.fst.Builder.UnCompiledNode;
// TODO: if FST is pure prefix trie we can do a more compact
@ -334,6 +344,43 @@ public class FST<T> {
out.writeVInt(bytes.length);
out.writeBytes(bytes, 0, bytes.length);
}
/**
* Writes an automaton to a file.
*/
/**
 * Writes this automaton to the given file.
 */
public void save(final File file) throws IOException {
  final OutputStream stream = new BufferedOutputStream(new FileOutputStream(file));
  boolean ok = false;
  try {
    save(new OutputStreamDataOutput(stream));
    ok = true;
  } finally {
    // On failure, suppress secondary close() exceptions so the original one propagates.
    if (!ok) {
      IOUtils.closeWhileHandlingException(stream);
    } else {
      IOUtils.close(stream);
    }
  }
}
/**
* Reads an automaton from a file.
*/
/**
 * Reads an automaton from the given file.
 */
public static <T> FST<T> read(File file, Outputs<T> outputs) throws IOException {
  final InputStream stream = new BufferedInputStream(new FileInputStream(file));
  boolean ok = false;
  try {
    final FST<T> result = new FST<T>(new InputStreamDataInput(stream), outputs);
    ok = true;
    return result;
  } finally {
    // On failure, suppress secondary close() exceptions so the original one propagates.
    if (!ok) {
      IOUtils.closeWhileHandlingException(stream);
    } else {
      IOUtils.close(stream);
    }
  }
}
private void writeLabel(int v) throws IOException {
assert v >= 0: "v=" + v;

View File

@ -0,0 +1,29 @@
package org.apache.lucene.search.suggest.fst;
import java.io.IOException;
import java.util.Iterator;
import org.apache.lucene.util.BytesRef;
/**
 * Collects {@link BytesRef} instances and then allows one to iterate over their sorted order.
 * Implementations of this interface will be called in a single-threaded scenario.
 */
public interface BytesRefSorter {
  /**
   * Adds a single suggestion entry (possibly compound with its bucket).
   *
   * @param utf8 The entry to collect.
   * @throws IOException If an I/O exception occurs.
   * @throws IllegalStateException If an addition attempt is performed after
   * a call to {@link #iterator()} has been made.
   */
  void add(BytesRef utf8) throws IOException, IllegalStateException;

  /**
   * Sorts the entries added in {@link #add(BytesRef)} and returns
   * an iterator over all sorted entries.
   *
   * @throws IOException If an I/O exception occurs.
   */
  Iterator<BytesRef> iterator() throws IOException;
}

View File

@ -0,0 +1,105 @@
package org.apache.lucene.search.suggest.fst;
import java.io.*;
import java.util.Iterator;
import java.util.NoSuchElementException;
import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesReader;
import org.apache.lucene.util.BytesRef;
/**
 * Builds and iterates over sequences stored on disk.
 */
public class ExternalRefSorter implements BytesRefSorter, Closeable {
  private final Sort sort;
  private Sort.ByteSequencesWriter writer;
  // Temporary file collecting raw (unsorted) input; null after sorting.
  private File input;
  // Temporary file with sorted content; null until iterator() is first called.
  private File sorted;

  /**
   * Will buffer all sequences to a temporary file and then sort (all on-disk).
   */
  public ExternalRefSorter(Sort sort) throws IOException {
    this.sort = sort;
    this.input = File.createTempFile("RefSorter-", ".raw", Sort.defaultTempDir());
    this.writer = new Sort.ByteSequencesWriter(input);
  }

  @Override
  public void add(BytesRef utf8) throws IOException {
    // Once iterator() has been called the writer is closed and nulled;
    // further additions violate the BytesRefSorter contract.
    if (writer == null)
      throw new IllegalStateException();
    writer.write(utf8);
  }

  @Override
  public Iterator<BytesRef> iterator() throws IOException {
    if (sorted == null) {
      closeWriter();

      sorted = File.createTempFile("RefSorter-", ".sorted", Sort.defaultTempDir());
      sort.sort(input, sorted);

      // The raw input is no longer needed once sorted output exists.
      input.delete();
      input = null;
    }

    return new ByteSequenceIterator(new Sort.ByteSequencesReader(sorted));
  }

  // Closes the writer (if still open); idempotent.
  private void closeWriter() throws IOException {
    if (writer != null) {
      writer.close();
      writer = null;
    }
  }

  /**
   * Removes any written temporary files.
   */
  @Override
  public void close() throws IOException {
    try {
      closeWriter();
    } finally {
      if (input != null) input.delete();
      if (sorted != null) sorted.delete();
    }
  }

  /**
   * Iterate over byte refs in a file.
   */
  class ByteSequenceIterator implements Iterator<BytesRef> {
    private ByteSequencesReader reader;
    private byte[] next;

    public ByteSequenceIterator(ByteSequencesReader reader) throws IOException {
      this.reader = reader;
      this.next = reader.read();
    }

    @Override
    public boolean hasNext() {
      return next != null;
    }

    @Override
    public BytesRef next() {
      if (next == null) throw new NoSuchElementException();
      BytesRef r = new BytesRef(next);
      try {
        next = reader.read();
        if (next == null) {
          reader.close();
        }
      } catch (IOException e) {
        // Fix: don't leak the open reader (file handle) when a read fails
        // mid-iteration; close it, preferring to propagate the read failure.
        try {
          reader.close();
        } catch (IOException ignored) {
          // Ignored: the original read exception is more informative.
        }
        throw new RuntimeException(e);
      }
      return r;
    }

    @Override
    public void remove() { throw new UnsupportedOperationException(); }
  }
}

View File

@ -0,0 +1,381 @@
package org.apache.lucene.search.suggest.fst;
import java.io.IOException;
import java.util.*;
import org.apache.lucene.util.*;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.Arc;
/**
 * Finite state automata based implementation of "autocomplete" functionality.
 *
 * @see FSTCompletionBuilder
 */
// TODO: we could store exact weights as outputs from the FST (int4 encoded
// floats). This would provide exact outputs from this method and to some
// degree allowed post-sorting on a more fine-grained weight.
// TODO: support for Analyzers (infix suggestions, synonyms?)
public class FSTCompletion {
  /**
   * A single completion for a given key.
   */
  public static final class Completion implements Comparable<Completion> {
    /** UTF-8 bytes of the suggestion. */
    public final BytesRef utf8;
    /** The weight bucket this suggestion was found in. */
    public final int bucket;

    Completion(BytesRef key, int bucket) {
      // Deep copy: the caller's scratch buffer may be reused.
      this.utf8 = BytesRef.deepCopyOf(key);
      this.bucket = bucket;
    }

    @Override
    public String toString() {
      return utf8.utf8ToString() + "/" + bucket;
    }

    /** @see BytesRef#compareTo(BytesRef) */
    public int compareTo(Completion o) {
      return this.utf8.compareTo(o.utf8);
    }
  }

  /**
   * Default number of buckets.
   */
  public static final int DEFAULT_BUCKETS = 10;

  /**
   * An empty result. Keep this an {@link ArrayList} to keep all the returned
   * lists of single type (monomorphic calls).
   */
  private static final ArrayList<Completion> EMPTY_RESULT = new ArrayList<Completion>();

  /**
   * Finite state automaton encoding all the lookup terms. See class notes for
   * details.
   */
  private final FST<Object> automaton;

  /**
   * An array of arcs leaving the root automaton state and encoding weights of
   * all completions in their sub-trees.
   */
  private final Arc<Object>[] rootArcs;

  /**
   * @see #FSTCompletion(FST, boolean, boolean)
   */
  private boolean exactFirst;

  /**
   * @see #FSTCompletion(FST, boolean, boolean)
   */
  private boolean higherWeightsFirst;

  /**
   * @param automaton
   *          Automaton with completions. See {@link FSTCompletionBuilder}.
   * @param higherWeightsFirst
   *          Return most popular suggestions first. This is the default
   *          behavior for this implementation. Setting it to <code>false</code>
   *          has no effect (use constant term weights to sort alphabetically
   *          only).
   * @param exactFirst
   *          Find and push an exact match to the first position of the result
   *          list if found.
   */
  @SuppressWarnings("unchecked")
  public FSTCompletion(FST<Object> automaton, boolean higherWeightsFirst, boolean exactFirst) {
    this.automaton = automaton;
    if (automaton != null) {
      this.rootArcs = cacheRootArcs(automaton);
    } else {
      this.rootArcs = new Arc[0];
    }
    this.higherWeightsFirst = higherWeightsFirst;
    this.exactFirst = exactFirst;
  }

  /**
   * Defaults to higher weights first and exact first.
   * @see #FSTCompletion(FST, boolean, boolean)
   */
  public FSTCompletion(FST<Object> automaton) {
    this(automaton, true, true);
  }

  /**
   * Cache the root node's output arcs starting with completions with the
   * highest weights.
   */
  @SuppressWarnings({"all"})
  private static Arc<Object>[] cacheRootArcs(FST<Object> automaton) {
    try {
      List<Arc<Object>> rootArcs = new ArrayList<Arc<Object>>();
      Arc<Object> arc = automaton.getFirstArc(new Arc<Object>());
      automaton.readFirstTargetArc(arc, arc);
      while (true) {
        rootArcs.add(new Arc<Object>().copyFrom(arc));
        if (arc.isLast()) break;
        automaton.readNextArc(arc);
      }

      Collections.reverse(rootArcs); // we want highest weights first.
      return rootArcs.toArray(new Arc[rootArcs.size()]);
    } catch (IOException e) {
      // The FST is in memory here; an IOException should never happen, but anyway.
      throw new RuntimeException(e);
    }
  }

  /**
   * Returns the first exact match by traversing root arcs, starting from the
   * arc <code>rootArcIndex</code>.
   *
   * @param rootArcIndex
   *          The first root arc index in {@link #rootArcs} to consider when
   *          matching.
   *
   * @param utf8
   *          The sequence of utf8 bytes to follow.
   *
   * @return Returns the bucket number of the match or <code>null</code> if no
   *         match was found.
   */
  private Integer getExactMatchStartingFromRootArc(
      int rootArcIndex, BytesRef utf8) {
    // Get the UTF-8 bytes representation of the input key.
    try {
      final FST.Arc<Object> scratch = new FST.Arc<Object>();
      for (; rootArcIndex < rootArcs.length; rootArcIndex++) {
        final FST.Arc<Object> rootArc = rootArcs[rootArcIndex];
        final FST.Arc<Object> arc = scratch.copyFrom(rootArc);

        // Descend into the automaton using the key as prefix.
        if (descendWithPrefix(arc, utf8)) {
          automaton.readFirstTargetArc(arc, arc);
          if (arc.label == FST.END_LABEL) {
            // An END_LABEL here means the key is a complete entry under this
            // root arc; the root arc's label is the bucket.
            return rootArc.label;
          }
        }
      }
    } catch (IOException e) {
      // Should never happen, but anyway.
      throw new RuntimeException(e);
    }

    // No match.
    return null;
  }

  /**
   * Lookup suggestions to <code>key</code>.
   *
   * @param key
   *          The prefix to which suggestions should be sought.
   * @param num
   *          At most this number of suggestions will be returned.
   * @return Returns the suggestions, sorted by their approximated weight first
   *         (decreasing) and then alphabetically (UTF-8 codepoint order).
   */
  public List<Completion> lookup(String key, int num) {
    if (key.length() == 0 || automaton == null) {
      return EMPTY_RESULT;
    }

    try {
      BytesRef keyUtf8 = new BytesRef(key);
      if (!higherWeightsFirst && rootArcs.length > 1) {
        // We could emit a warning here (?). An optimal strategy for
        // alphabetically sorted suggestions would be to add them with a
        // constant weight -- this saves unnecessary traversals and sorting.
        return lookupSortedAlphabetically(keyUtf8, num);
      } else {
        return lookupSortedByWeight(keyUtf8, num, false);
      }
    } catch (IOException e) {
      // Should never happen, but anyway.
      throw new RuntimeException(e);
    }
  }

  /**
   * Lookup suggestions sorted alphabetically <b>if weights are not
   * constant</b>. This is a workaround: in general, use constant weights for
   * alphabetically sorted result.
   */
  private List<Completion> lookupSortedAlphabetically(BytesRef key, int num)
      throws IOException {
    // Greedily get num results from each weight branch.
    List<Completion> res = lookupSortedByWeight(key, num, true);

    // Sort and trim.
    Collections.sort(res);
    if (res.size() > num) {
      res = res.subList(0, num);
    }
    return res;
  }

  /**
   * Lookup suggestions sorted by weight (descending order).
   *
   * @param collectAll
   *          If <code>true</code>, the routine terminates immediately when
   *          <code>num</code> suggestions have been collected. If
   *          <code>false</code>, it will collect suggestions from all weight
   *          arcs (needed for {@link #lookupSortedAlphabetically}).
   */
  private ArrayList<Completion> lookupSortedByWeight(BytesRef key,
      int num, boolean collectAll) throws IOException {
    // Don't overallocate the results buffers. This also serves the purpose of
    // allowing the user of this class to request all matches using Integer.MAX_VALUE as
    // the number of results.
    final ArrayList<Completion> res = new ArrayList<Completion>(Math.min(10, num));

    final BytesRef output = BytesRef.deepCopyOf(key);
    for (int i = 0; i < rootArcs.length; i++) {
      final FST.Arc<Object> rootArc = rootArcs[i];
      final FST.Arc<Object> arc = new FST.Arc<Object>().copyFrom(rootArc);

      // Descend into the automaton using the key as prefix.
      if (descendWithPrefix(arc, key)) {
        // A subgraph starting from the current node has the completions
        // of the key prefix. The arc we're at is the last key's byte,
        // so we will collect it too.
        output.length = key.length - 1;
        if (collect(res, num, rootArc.label, output, arc) && !collectAll) {
          // We have enough suggestions to return immediately. Keep on looking
          // for an exact match, if requested.
          if (exactFirst) {
            if (!checkExistingAndReorder(res, key)) {
              Integer exactMatchBucket = getExactMatchStartingFromRootArc(i, key);
              if (exactMatchBucket != null) {
                // Insert as the first result and truncate at num.
                while (res.size() >= num) {
                  res.remove(res.size() - 1);
                }
                res.add(0, new Completion(key, exactMatchBucket));
              }
            }
          }
          break;
        }
      }
    }
    return res;
  }

  /**
   * Checks if the list of
   * {@link org.apache.lucene.search.suggest.Lookup.LookupResult}s already has a
   * <code>key</code>. If so, reorders that
   * {@link org.apache.lucene.search.suggest.Lookup.LookupResult} to the first
   * position.
   *
   * @return Returns <code>true</code> if and only if <code>list</code> contained
   *         <code>key</code>.
   */
  private boolean checkExistingAndReorder(ArrayList<Completion> list, BytesRef key) {
    // We assume list does not have duplicates (because of how the FST is created).
    for (int i = list.size(); --i >= 0;) {
      if (key.equals(list.get(i).utf8)) {
        // Key found. Unless already at i==0, remove it and push up front so
        // that the ordering remains identical with the exception of the exact
        // match.
        list.add(0, list.remove(i));
        return true;
      }
    }
    return false;
  }

  /**
   * Descend along the path starting at <code>arc</code> and going through bytes
   * in the argument.
   *
   * @param arc
   *          The starting arc. This argument is modified in-place.
   * @param utf8
   *          The term to descend along.
   * @return If <code>true</code>, <code>arc</code> will be set to the arc
   *         matching last byte of <code>term</code>. <code>false</code> is
   *         returned if no such prefix exists.
   */
  private boolean descendWithPrefix(Arc<Object> arc, BytesRef utf8)
      throws IOException {
    final int max = utf8.offset + utf8.length;
    for (int i = utf8.offset; i < max; i++) {
      // Labels are unsigned bytes; mask to avoid sign extension.
      if (automaton.findTargetArc(utf8.bytes[i] & 0xff, arc, arc) == null) {
        // No matching prefixes, return an empty result.
        return false;
      }
    }
    return true;
  }

  /**
   * Recursive collect lookup results from the automaton subgraph starting at
   * <code>arc</code>.
   *
   * @param num
   *          Maximum number of results needed (early termination).
   */
  private boolean collect(List<Completion> res, int num, int bucket,
      BytesRef output, Arc<Object> arc) throws IOException {
    if (output.length == output.bytes.length) {
      output.bytes = ArrayUtil.grow(output.bytes);
    }
    assert output.offset == 0;
    output.bytes[output.length++] = (byte) arc.label;

    automaton.readFirstTargetArc(arc, arc);
    while (true) {
      if (arc.label == FST.END_LABEL) {
        // A final state: emit the accumulated bytes as one completion.
        res.add(new Completion(output, bucket));
        if (res.size() >= num) return true;
      } else {
        // Save and restore output.length around the recursive descent.
        int save = output.length;
        if (collect(res, num, bucket, output, new Arc<Object>().copyFrom(arc))) {
          return true;
        }
        output.length = save;
      }

      if (arc.isLast()) {
        break;
      }
      automaton.readNextArc(arc);
    }
    return false;
  }

  /**
   * Returns the bucket count (discretization thresholds).
   */
  public int getBucketCount() {
    return rootArcs.length;
  }

  /**
   * Returns the bucket assigned to a given key (if found) or <code>null</code> if
   * no exact match exists.
   */
  public Integer getBucket(String key) {
    return getExactMatchStartingFromRootArc(0, new BytesRef(key));
  }

  /**
   * Returns the internal automaton.
   */
  public FST<Object> getFST() {
    return automaton;
  }
}

View File

@ -0,0 +1,233 @@
package org.apache.lucene.search.suggest.fst;
import java.io.Closeable;
import java.io.IOException;
import java.util.Iterator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.fst.*;
/**
 * Finite state automata based implementation of "autocomplete" functionality.
 *
 * <h2>Implementation details</h2>
 *
 * <p>
 * The construction step in {@link #build()} works as follows:
 * <ul>
 * <li>A set of input terms and their buckets is given.</li>
 * <li>All terms in the input are prefixed with a synthetic pseudo-character
 * (code) of the weight bucket the term fell into. For example a term
 * <code>abc</code> with a discretized weight equal '1' would become
 * <code>1abc</code>.</li>
 * <li>The terms are then sorted by their raw value of UTF-8 character values
 * (including the synthetic bucket code in front).</li>
 * <li>A finite state automaton ({@link FST}) is constructed from the input. The
 * root node has arcs labeled with all possible weights. We cache all these
 * arcs, highest-weight first.</li>
 * </ul>
 *
 * <p>
 * At runtime, in {@link FSTCompletion#lookup(String, int)},
 * the automaton is utilized as follows:
 * <ul>
 * <li>For each possible term weight encoded in the automaton (cached arcs from
 * the root above), starting with the highest one, we descend along the path of
 * the input key. If the key is not a prefix of a sequence in the automaton
 * (path ends prematurely), we exit immediately -- no completions.</li>
 * <li>Otherwise, we have found an internal automaton node that ends the key.
 * <b>The entire subautomaton (all paths) starting from this node form the key's
 * completions.</b> We start the traversal of this subautomaton. Every time we
 * reach a final state (arc), we add a single suggestion to the list of results
 * (the weight of this suggestion is constant and equal to the root path we
 * started from). The tricky part is that because automaton edges are sorted and
 * we scan depth-first, we can terminate the entire procedure as soon as we
 * collect enough suggestions the user requested.</li>
 * <li>In case the number of suggestions collected in the step above is still
 * insufficient, we proceed to the next (smaller) weight leaving the root node
 * and repeat the same algorithm again.</li>
 * </ul>
 *
 * <h2>Runtime behavior and performance characteristic</h2>
 *
 * The algorithm described above is optimized for finding suggestions to short
 * prefixes in a top-weights-first order. This is probably the most common use
 * case: it allows presenting suggestions early and sorts them by the global
 * frequency (and then alphabetically).
 *
 * <p>
 * If there is an exact match in the automaton, it is returned first on the
 * results list (even with by-weight sorting).
 *
 * <p>
 * Note that the maximum lookup time for <b>any prefix</b> is the time of
 * descending to the subtree, plus traversal of the subtree up to the number of
 * requested suggestions (because they are already presorted by weight on the
 * root level and alphabetically at any node level).
 *
 * <p>
 * To order alphabetically only (no ordering by priorities), use identical term
 * weights for all terms. Alphabetical suggestions are returned even if
 * non-constant weights are used, but the algorithm for doing this is
 * suboptimal.
 *
 * <p>
 * "alphabetically" in any of the documentation above indicates UTF-8
 * representation order, nothing else.
 *
 * <p>
 * <b>NOTE</b>: the FST file format is experimental and subject to sudden
 * change, requiring you to rebuild the FST suggest index.
 *
 * @see FSTCompletion
 */
public class FSTCompletionBuilder {
  /**
   * Default number of buckets.
   */
  public static final int DEFAULT_BUCKETS = 10;

  /**
   * The number of separate buckets for weights (discretization). The more
   * buckets, the more fine-grained term weights (priorities) can be assigned.
   * The speed of lookup will not decrease for prefixes which have
   * highly-weighted completions (because these are filled-in first), but will
   * decrease significantly for low-weighted terms (but these should be
   * infrequent, so it is all right).
   *
   * <p>
   * The number of buckets must be within [1, 255] range.
   */
  private final int buckets;

  /**
   * Finite state automaton encoding all the lookup terms. See class notes for
   * details.
   */
  FST<Object> automaton;

  /**
   * FST construction require re-sorting the input. This is the class that
   * collects all the input entries, their weights and then provides sorted
   * order.
   */
  private final BytesRefSorter sorter;

  /**
   * Scratch buffer for {@link #add(BytesRef, int)}.
   */
  private final BytesRef scratch = new BytesRef();

  /**
   * Max tail sharing length.
   */
  private final int shareMaxTailLength;

  /**
   * Creates an {@link FSTCompletion} with default options: 10 buckets, exact match
   * promoted to first position and {@link InMemorySorter}.
   */
  public FSTCompletionBuilder() {
    this(DEFAULT_BUCKETS, new InMemorySorter(), Integer.MAX_VALUE);
  }

  /**
   * @param buckets
   *          The number of buckets for weight discretization. Buckets are used
   *          in {@link #add(BytesRef, int)} and must be smaller than the number
   *          given here.
   *
   * @param sorter
   *          {@link BytesRefSorter} used for re-sorting input for the automaton.
   *          For large inputs, use on-disk sorting implementations. The sorter
   *          is closed automatically in {@link #build()} if it implements
   *          {@link Closeable}.
   *
   * @param shareMaxTailLength
   *          Max shared suffix sharing length.
   *
   *          See the description of this parameter in {@link Builder}'s constructor.
   *          In general, for very large inputs you'll want to construct a non-minimal
   *          automaton which will be larger, but the construction will take far less ram.
   *          For minimal automata, set it to {@link Integer#MAX_VALUE}.
   */
  public FSTCompletionBuilder(int buckets, BytesRefSorter sorter, int shareMaxTailLength) {
    if (buckets < 1 || buckets > 255) {
      throw new IllegalArgumentException("Buckets must be >= 1 and <= 255: "
          + buckets);
    }

    if (sorter == null) throw new IllegalArgumentException(
        "BytesRefSorter must not be null.");

    this.sorter = sorter;
    this.buckets = buckets;
    this.shareMaxTailLength = shareMaxTailLength;
  }

  /**
   * Appends a single suggestion and its weight to the internal buffers.
   *
   * @param utf8
   *          The suggestion (utf8 representation) to be added. The content is
   *          copied and the object can be reused.
   * @param bucket
   *          The bucket to place this suggestion in. Must be non-negative and
   *          smaller than the number of buckets passed in the constructor.
   *          Higher numbers indicate suggestions that should be presented
   *          before suggestions placed in smaller buckets.
   */
  public void add(BytesRef utf8, int bucket) throws IOException {
    if (bucket < 0 || bucket >= buckets) {
      throw new IllegalArgumentException(
          "Bucket outside of the allowed range [0, " + buckets + "): " + bucket);
    }

    // Prepend the bucket as a single synthetic byte in front of the term.
    if (scratch.bytes.length < utf8.length + 1) {
      scratch.grow(utf8.length + 10);
    }
    scratch.length = 1;
    scratch.bytes[0] = (byte) bucket;
    scratch.append(utf8);
    sorter.add(scratch);
  }

  /**
   * Builds the final automaton from a list of added entries. This method may
   * take a longer while as it needs to build the automaton.
   */
  public FSTCompletion build() throws IOException {
    try {
      this.automaton = buildAutomaton(sorter);
    } finally {
      // Fix: close a closeable sorter even when automaton construction throws,
      // so that its temporary resources (e.g. on-disk files) are released.
      if (sorter instanceof Closeable) {
        ((Closeable) sorter).close();
      }
    }
    return new FSTCompletion(automaton);
  }

  /**
   * Builds the final automaton from a list of entries.
   */
  private FST<Object> buildAutomaton(BytesRefSorter sorter) throws IOException {
    // Build the automaton.
    final Outputs<Object> outputs = NoOutputs.getSingleton();
    final Object empty = outputs.getNoOutput();
    final Builder<Object> builder = new Builder<Object>(
        FST.INPUT_TYPE.BYTE1, 0, 0, true, true,
        shareMaxTailLength, outputs, null);

    BytesRef scratch = new BytesRef();
    int count = 0;
    for (Iterator<BytesRef> i = sorter.iterator(); i.hasNext(); count++) {
      BytesRef entry = i.next();
      // Skip exact duplicates of the previous entry (input is sorted).
      if (scratch.compareTo(entry) != 0) {
        builder.add(entry, empty);
        scratch.copyBytes(entry);
      }
    }

    return count == 0 ? null : builder.finish();
  }
}

View File

@ -0,0 +1,251 @@
package org.apache.lucene.search.suggest.fst;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.fst.FSTCompletion.Completion;
import org.apache.lucene.search.suggest.fst.Sort.SortInfo;
import org.apache.lucene.search.suggest.tst.TSTLookup;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.*;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.NoOutputs;
/**
 * An adapter from {@link Lookup} API to {@link FSTCompletion}.
 *
 * <p>This adapter differs from {@link FSTCompletion} in that it attempts
 * to discretize any "weights" as passed from in {@link TermFreqIterator#freq()}
 * to match the number of buckets. For the rationale for bucketing, see
 * {@link FSTCompletion}.
 *
 * <p><b>Note:</b> Discretization requires an additional sorting pass.
 *
 * <p>The range of weights for bucketing/discretization is determined
 * by sorting the input by weight and then dividing into
 * equal ranges. Then, scores within each range are assigned to that bucket.
 *
 * <p>Note that this means that even large differences in weights may be lost
 * during automaton construction, but the overall distinction between "classes"
 * of weights will be preserved regardless of the distribution of weights.
 *
 * <p>For fine-grained control over which weights are assigned to which buckets,
 * use {@link FSTCompletion} directly or {@link TSTLookup}, for example.
 *
 * @see FSTCompletion
 */
public class FSTCompletionLookup extends Lookup {
  /**
   * Shared tail length for conflating in the created automaton. Setting this
   * to larger values ({@link Integer#MAX_VALUE}) will create smaller (or minimal)
   * automata at the cost of RAM for keeping nodes hash in the {@link FST}.
   *
   * <p>Empirical pick.
   */
  private final static int sharedTailLength = 5;

  /**
   * File name for the automaton.
   *
   * @see #store(File)
   * @see #load(File)
   */
  private static final String FILENAME = "fst.bin";

  private int buckets;
  private boolean exactMatchFirst;

  /**
   * Automaton used for completions with higher weights reordering.
   */
  private FSTCompletion higherWeightsCompletion;

  /**
   * Automaton used for normal completions.
   */
  private FSTCompletion normalCompletion;

  /**
   * Builds a lookup with the default number of buckets and exact match
   * promotion enabled.
   */
  public FSTCompletionLookup() {
    this(FSTCompletion.DEFAULT_BUCKETS, true);
  }

  /**
   * Wraps an existing, pre-built {@link FSTCompletion}; both internal views
   * (normal and higher-weights-first) share its automaton.
   */
  public FSTCompletionLookup(FSTCompletion completion, int buckets, boolean exactMatchFirst) {
    this(buckets, exactMatchFirst);
    this.normalCompletion = new FSTCompletion(
        completion.getFST(), false, exactMatchFirst);
    this.higherWeightsCompletion = new FSTCompletion(
        completion.getFST(), true, exactMatchFirst);
  }

  /**
   * @param buckets Number of weight-discretization buckets (see class notes).
   * @param exactMatchFirst Promote an exact match to the first result position.
   */
  public FSTCompletionLookup(int buckets, boolean exactMatchFirst) {
    this.buckets = buckets;
    this.exactMatchFirst = exactMatchFirst;
  }

  @Override
  public void build(TermFreqIterator tfit) throws IOException {
    File tempInput = File.createTempFile(
        FSTCompletionLookup.class.getSimpleName(), ".input", Sort.defaultTempDir());
    File tempSorted = File.createTempFile(
        FSTCompletionLookup.class.getSimpleName(), ".sorted", Sort.defaultTempDir());

    Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput);
    Sort.ByteSequencesReader reader = null;

    // Push floats up front before sequences to sort them. For now, assume they are non-negative.
    // If negative floats are allowed some trickery needs to be done to find their byte order.
    boolean success = false;
    try {
      BytesRef tmp1 = new BytesRef();
      byte [] buffer = new byte [0];
      ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);

      while (tfit.hasNext()) {
        String key = tfit.next();
        UnicodeUtil.UTF16toUTF8(key, 0, key.length(), tmp1);

        if (tmp1.length + 4 >= buffer.length) {
          buffer = ArrayUtil.grow(buffer, tmp1.length + 4);
        }

        output.reset(buffer);
        output.writeInt(FloatMagic.toSortable(tfit.freq()));
        output.writeBytes(tmp1.bytes, tmp1.offset, tmp1.length);
        writer.write(buffer, 0, output.getPosition());
      }
      writer.close();

      // We don't know the distribution of scores and we need to bucket them, so we'll sort
      // and divide into equal buckets.
      SortInfo info = new Sort().sort(tempInput, tempSorted);
      tempInput.delete();
      FSTCompletionBuilder builder = new FSTCompletionBuilder(
          buckets, new ExternalRefSorter(new Sort()), sharedTailLength);

      final int inputLines = info.lines;
      reader = new Sort.ByteSequencesReader(tempSorted);
      long line = 0;
      int previousBucket = 0;
      float previousScore = 0;
      ByteArrayDataInput input = new ByteArrayDataInput();
      BytesRef tmp2 = new BytesRef();
      while (reader.read(tmp1)) {
        input.reset(tmp1.bytes);
        float currentScore = FloatMagic.fromSortable(input.readInt());

        // Keep identical scores in the same bucket; otherwise assign by
        // position in the sorted input (equal-sized ranges).
        int bucket;
        if (line > 0 && currentScore == previousScore) {
          bucket = previousBucket;
        } else {
          bucket = (int) (line * buckets / inputLines);
        }
        previousScore = currentScore;
        previousBucket = bucket;

        // Only append the input, discard the weight.
        tmp2.bytes = tmp1.bytes;
        tmp2.offset = input.getPosition();
        tmp2.length = tmp1.length - input.getPosition();
        builder.add(tmp2, bucket);

        line++;
      }

      // The two FSTCompletions share the same automaton.
      this.higherWeightsCompletion = builder.build();
      this.normalCompletion = new FSTCompletion(
          higherWeightsCompletion.getFST(), false, exactMatchFirst);

      success = true;
    } finally {
      if (success)
        IOUtils.close(reader, writer);
      else
        IOUtils.closeWhileHandlingException(reader, writer);

      tempInput.delete();
      tempSorted.delete();
    }
  }

  @Override
  public List<LookupResult> lookup(String key, boolean higherWeightsFirst, int num) {
    final List<Completion> completions;
    if (higherWeightsFirst) {
      completions = higherWeightsCompletion.lookup(key, num);
    } else {
      completions = normalCompletion.lookup(key, num);
    }

    final ArrayList<LookupResult> results = new ArrayList<LookupResult>(completions.size());
    for (Completion c : completions) {
      results.add(new LookupResult(c.utf8.utf8ToString(), c.bucket));
    }
    return results;
  }

  @Override
  public boolean add(String key, Object value) {
    // Not supported.
    return false;
  }

  @Override
  public Float get(String key) {
    // Fix: look up the bucket once instead of performing the exact-match
    // FST traversal twice.
    final Integer bucket = normalCompletion.getBucket(key);
    if (bucket == null) {
      return null;
    }
    // Normalize the bucket index against the bucket count.
    return (float) bucket.intValue() / normalCompletion.getBucketCount();
  }

  /**
   * Deserialization from disk.
   */
  @Override
  public synchronized boolean load(File storeDir) throws IOException {
    File data = new File(storeDir, FILENAME);
    if (!data.exists() || !data.canRead()) {
      return false;
    }

    this.higherWeightsCompletion = new FSTCompletion(
        FST.read(data, NoOutputs.getSingleton()));
    this.normalCompletion = new FSTCompletion(
        higherWeightsCompletion.getFST(), false, exactMatchFirst);
    return true;
  }

  /**
   * Serialization to disk.
   */
  @Override
  public synchronized boolean store(File storeDir) throws IOException {
    if (!storeDir.exists() || !storeDir.isDirectory() || !storeDir.canWrite()) {
      return false;
    }

    if (this.normalCompletion == null)
      return false;

    normalCompletion.getFST().save(new File(storeDir, FILENAME));
    return true;
  }
}

View File

@ -1,570 +0,0 @@
package org.apache.lucene.search.suggest.fst;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.Arc;
import org.apache.lucene.util.fst.NoOutputs;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.tst.TSTLookup;
import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.store.OutputStreamDataOutput;
/**
 * Finite state automata based implementation of {@link Lookup} query
 * suggestion/ autocomplete interface.
 *
 * <h2>Implementation details</h2>
 *
 * <p>The construction step in {@link #build(TermFreqIterator)} works as follows:
 * <ul>
 * <li>A set of input terms (String) and weights (float) is given.</li>
 * <li>The range of weights is determined and then all weights are discretized into a fixed set
 * of values ({@link #buckets}).
 * Note that this means that minor changes in weights may be lost during automaton construction.
 * In general, this is not a big problem because the "priorities" of completions can be split
 * into a fixed set of classes (even as rough as: very frequent, frequent, baseline, marginal).
 * If you need exact, fine-grained weights, use {@link TSTLookup} instead.</li>
 * <li>All terms in the input are prepended with a synthetic pseudo-character being the weight
 * of that term. For example a term <code>abc</code> with a discretized weight equal '1' would
 * become <code>1abc</code>.</li>
 * <li>The terms are sorted by their raw value of utf16 character values (including the synthetic
 * term in front).</li>
 * <li>A finite state automaton ({@link FST}) is constructed from the input. The root node has
 * arcs labeled with all possible weights. We cache all these arcs, highest-weight first.</li>
 * </ul>
 *
 * <p>At runtime, in {@link #lookup(String, boolean, int)}, the automaton is utilized as follows:
 * <ul>
 * <li>For each possible term weight encoded in the automaton (cached arcs from the root above),
 * starting with the highest one, we descend along the path of the input key. If the key is not
 * a prefix of a sequence in the automaton (path ends prematurely), we exit immediately.
 * No completions.
 * <li>Otherwise, we have found an internal automaton node that ends the key. <b>The entire
 * subautomaton (all paths) starting from this node form the key's completions.</b> We start
 * the traversal of this subautomaton. Every time we reach a final state (arc), we add a single
 * suggestion to the list of results (the weight of this suggestion is constant and equal to the
 * root path we started from). The tricky part is that because automaton edges are sorted and
 * we scan depth-first, we can terminate the entire procedure as soon as we collect enough
 * suggestions the user requested.
 * <li>In case the number of suggestions collected in the step above is still insufficient,
 * we proceed to the next (smaller) weight leaving the root node and repeat the same
 * algorithm again.
 * </li>
 * </ul>
 *
 * <h2>Runtime behavior and performance characteristic</h2>
 *
 * <p>The algorithm described above is optimized for finding suggestions to short prefixes
 * in a top-weights-first order. This is probably the most common use case: it allows
 * presenting suggestions early and sorts them by the global frequency (and then alphabetically).
 *
 * <p>If there is an exact match in the automaton, it is returned first on the results
 * list (even with by-weight sorting).
 *
 * <p>Note that the maximum lookup time for <b>any prefix</b>
 * is the time of descending to the subtree, plus traversal of the subtree up to the number
 * of requested suggestions (because they are already presorted by weight on the root level
 * and alphabetically at any node level).
 *
 * <p>To order alphabetically only (no ordering by priorities), use identical term weights
 * for all terms. Alphabetical suggestions are returned even if non-constant weights are
 * used, but the algorithm for doing this is suboptimal.
 *
 * <p>"alphabetically" in any of the documentation above indicates utf16 codepoint order,
 * nothing else.
 *
 * <b>NOTE</b>: the FST file format is experimental and
 * subject to suddenly change, requiring you to rebuild the
 * FST suggest index.
 */
public class FSTLookup extends Lookup {

  /** Creates a lookup with 10 weight buckets and exact-match-first enabled. */
  public FSTLookup() {
    this(10, true);
  }

  /**
   * Creates a lookup.
   *
   * @param buckets Number of weight discretization buckets, see {@link #buckets}.
   * @param exactMatchFirst If <code>true</code>, an exact match is promoted to the
   *        first position on the result list, see {@link #exactMatchFirst}.
   */
  public FSTLookup(int buckets, boolean exactMatchFirst) {
    this.buckets = buckets;
    this.exactMatchFirst = exactMatchFirst;
  }

  /** A structure for a single entry (for sorting/ preprocessing). */
  private static class Entry {
    // Term characters; slot [0] is reserved for the weight pseudo-character
    // (filled in by encodeWeightPrefix).
    char [] term;
    float weight;

    public Entry(char [] term, float freq) {
      this.term = term;
      this.weight = freq;
    }
  }

  /** Serialized automaton file name (storage). */
  public static final String FILENAME = "fst.dat";

  /** An empty result. Shared immutable instance to avoid per-call allocation. */
  private static final List<LookupResult> EMPTY_RESULT = Collections.emptyList();

  /**
   * The number of separate buckets for weights (discretization). The more buckets,
   * the more fine-grained term weights (priorities) can be assigned. The speed of lookup
   * will not decrease for prefixes which have highly-weighted completions (because these
   * are filled-in first), but will decrease significantly for low-weighted terms (but
   * these should be infrequent, so it is all right).
   *
   * <p>The number of buckets must be within [1, 255] range.
   */
  private final int buckets;

  /**
   * If <code>true</code>, exact suggestions are returned first, even if they are prefixes
   * of other strings in the automaton (possibly with larger weights).
   */
  private final boolean exactMatchFirst;

  /**
   * Finite state automaton encoding all the lookup terms. See class
   * notes for details.
   */
  private FST<Object> automaton;

  /**
   * An array of arcs leaving the root automaton state and encoding weights of all
   * completions in their sub-trees.
   */
  private Arc<Object> [] rootArcs;

  /**
   * Builds the automaton from all terms and frequencies delivered by the iterator.
   * The entire input is buffered in memory because weight discretization requires
   * two passes over the data.
   */
  @Override
  public void build(TermFreqIterator tfit) throws IOException {
    // Buffer the input because we will need it twice: for calculating
    // weights distribution and for the actual automata building.
    List<Entry> entries = new ArrayList<Entry>();
    while (tfit.hasNext()) {
      String term = tfit.next();
      char [] termChars = new char [term.length() + 1]; // add padding for weight.
      for (int i = 0; i < term.length(); i++)
        termChars[i + 1] = term.charAt(i);
      entries.add(new Entry(termChars, tfit.freq()));
    }

    // Distribute weights into at most N buckets. This is a form of discretization to
    // limit the number of possible weights so that they can be efficiently encoded in the
    // automaton.
    //
    // It is assumed the distribution of weights is _linear_ so proportional division
    // of [min, max] range will be enough here. Other approaches could be to sort
    // weights and divide into proportional ranges.
    if (entries.size() > 0) {
      redistributeWeightsProportionalMinMax(entries, buckets);
      encodeWeightPrefix(entries);
    }

    // Build the automaton (includes input sorting) and cache root arcs in order from the highest,
    // to the lowest weight.
    this.automaton = buildAutomaton(entries);
    cacheRootArcs();
  }

  /**
   * Cache the root node's output arcs starting with completions with the highest weights.
   */
  @SuppressWarnings("unchecked")
  private void cacheRootArcs() throws IOException {
    if (automaton != null) {
      List<Arc<Object>> rootArcs = new ArrayList<Arc<Object>>();
      Arc<Object> arc = automaton.getFirstArc(new Arc<Object>());
      automaton.readFirstTargetArc(arc, arc);
      while (true) {
        rootArcs.add(new Arc<Object>().copyFrom(arc));
        if (arc.isLast())
          break;
        automaton.readNextArc(arc);
      }
      // Arcs were collected in ascending label (weight) order; reverse so that
      // the traversal below starts with the highest weight.
      Collections.reverse(rootArcs); // we want highest weights first.
      this.rootArcs = rootArcs.toArray(new Arc[rootArcs.size()]);
    }
  }

  /**
   * Not implemented.
   */
  @Override
  public boolean add(String key, Object value) {
    // This implementation does not support ad-hoc additions (all input
    // must be sorted for the builder).
    return false;
  }

  /**
   * Get the (approximated) weight of a single key (if there is a perfect match
   * for it in the automaton).
   *
   * @return Returns the approximated weight of the input key or <code>null</code>
   * if not found.
   */
  @Override
  public Float get(String key) {
    return getExactMatchStartingFromRootArc(0, key);
  }

  /**
   * Returns the first exact match by traversing root arcs, starting from
   * the arc <code>i</code>.
   *
   * @param i The first root arc index in {@link #rootArcs} to consider when
   * matching.
   */
  private Float getExactMatchStartingFromRootArc(int i, String key) {
    // Get the UTF-8 bytes representation of the input key.
    try {
      final FST.Arc<Object> scratch = new FST.Arc<Object>();
      for (; i < rootArcs.length; i++) {
        final FST.Arc<Object> rootArc = rootArcs[i];
        final FST.Arc<Object> arc = scratch.copyFrom(rootArc);

        // Descend into the automaton using the key as prefix.
        if (descendWithPrefix(arc, key)) {
          automaton.readFirstTargetArc(arc, arc);
          if (arc.label == FST.END_LABEL) {
            // Prefix-encoded weight: the root arc's label IS the bucket index.
            return rootArc.label / (float) buckets;
          }
        }
      }
    } catch (IOException e) {
      // Should never happen, but anyway.
      throw new RuntimeException(e);
    }
    return null;
  }

  /**
   * Lookup autocomplete suggestions to <code>key</code>.
   *
   * @param key The prefix to which suggestions should be sought.
   * @param onlyMorePopular Return most popular suggestions first. This is the default
   * behavior for this implementation. Setting it to <code>false</code> has no effect (use
   * constant term weights to sort alphabetically only).
   * @param num At most this number of suggestions will be returned.
   * @return Returns the suggestions, sorted by their approximated weight first (decreasing)
   * and then alphabetically (utf16 codepoint order).
   */
  @Override
  public List<LookupResult> lookup(String key, boolean onlyMorePopular, int num) {
    if (key.length() == 0 || automaton == null) {
      // Keep the result an ArrayList to keep calls monomorphic.
      return EMPTY_RESULT;
    }

    try {
      if (!onlyMorePopular && rootArcs.length > 1) {
        // We could emit a warning here (?). An optimal strategy for alphabetically sorted
        // suggestions would be to add them with a constant weight -- this saves unnecessary
        // traversals and sorting.
        return lookupSortedAlphabetically(key, num);
      } else {
        return lookupSortedByWeight(key, num, false);
      }
    } catch (IOException e) {
      // Should never happen, but anyway.
      throw new RuntimeException(e);
    }
  }

  /**
   * Lookup suggestions sorted alphabetically <b>if weights are not constant</b>. This
   * is a workaround: in general, use constant weights for alphabetically sorted result.
   */
  private List<LookupResult> lookupSortedAlphabetically(String key, int num) throws IOException {
    // Greedily get num results from each weight branch.
    List<LookupResult> res = lookupSortedByWeight(key, num, true);

    // Sort and trim.
    Collections.sort(res, new Comparator<LookupResult>() {
      // not till java6 @Override
      public int compare(LookupResult o1, LookupResult o2) {
        return o1.key.compareTo(o2.key);
      }
    });
    if (res.size() > num) {
      res = res.subList(0, num);
    }
    return res;
  }

  /**
   * Lookup suggestions sorted by weight (descending order).
   *
   * @param collectAll If <code>false</code>, the routine terminates as soon as
   * <code>num</code> suggestions have been collected. If <code>true</code>, it will collect
   * suggestions from all weight arcs (needed for {@link #lookupSortedAlphabetically}).
   */
  private ArrayList<LookupResult> lookupSortedByWeight(String key, int num, boolean collectAll) throws IOException {
    // Don't overallocate the results buffers. This also serves the purpose of allowing
    // the user of this class to request all matches using Integer.MAX_VALUE as the number
    // of results.
    final ArrayList<LookupResult> res = new ArrayList<LookupResult>(Math.min(10, num));
    final StringBuilder output = new StringBuilder(key);
    final int matchLength = key.length() - 1;

    for (int i = 0; i < rootArcs.length; i++) {
      final FST.Arc<Object> rootArc = rootArcs[i];
      final FST.Arc<Object> arc = new FST.Arc<Object>().copyFrom(rootArc);

      // Descend into the automaton using the key as prefix.
      if (descendWithPrefix(arc, key)) {
        // Prefix-encoded weight.
        final float weight = rootArc.label / (float) buckets;

        // A subgraph starting from the current node has the completions
        // of the key prefix. The arc we're at is the last key's byte,
        // so we will collect it too.
        output.setLength(matchLength);
        if (collect(res, num, weight, output, arc) && !collectAll) {
          // We have enough suggestions to return immediately. Keep on looking for an
          // exact match, if requested.
          if (exactMatchFirst) {
            if (!checkExistingAndReorder(res, key)) {
              Float exactMatchWeight = getExactMatchStartingFromRootArc(i, key);
              if (exactMatchWeight != null) {
                // Insert as the first result and truncate at num.
                while (res.size() >= num) {
                  res.remove(res.size() - 1);
                }
                res.add(0, new LookupResult(key, exactMatchWeight));
              }
            }
          }
          break;
        }
      }
    }
    return res;
  }

  /**
   * Checks if the list of {@link LookupResult}s already has a <code>key</code>. If so,
   * reorders that {@link LookupResult} to the first position.
   *
   * @return Returns <code>true</code> if and only if <code>list</code> contained <code>key</code>.
   */
  private boolean checkExistingAndReorder(ArrayList<LookupResult> list, String key) {
    // We assume list does not have duplicates (because of how the FST is created).
    for (int i = list.size(); --i >= 0;) {
      if (key.equals(list.get(i).key)) {
        // Key found. Unless already at i==0, remove it and push up front so that the ordering
        // remains identical with the exception of the exact match.
        list.add(0, list.remove(i));
        return true;
      }
    }
    return false;
  }

  /**
   * Descend along the path starting at <code>arc</code> and going through
   * the characters of <code>term</code>.
   *
   * @param arc The starting arc. This argument is modified in-place.
   * @param term The term to descend with.
   * @return If <code>true</code>, <code>arc</code> will be set to the arc matching
   * the last character of <code>term</code>. <code>false</code> is returned if no
   * such prefix exists.
   */
  private boolean descendWithPrefix(Arc<Object> arc, String term) throws IOException {
    final int max = term.length();
    for (int i = 0; i < max; i++) {
      if (automaton.findTargetArc(term.charAt(i) & 0xffff, arc, arc) == null) {
        // No matching prefixes, return an empty result.
        return false;
      }
    }
    return true;
  }

  /**
   * Recursive collect lookup results from the automaton subgraph starting at <code>arc</code>.
   *
   * @param num Maximum number of results needed (early termination).
   * @param weight Weight of all results found during this collection.
   */
  private boolean collect(List<LookupResult> res, int num, float weight, StringBuilder output, Arc<Object> arc) throws IOException {
    output.append((char) arc.label);

    automaton.readFirstTargetArc(arc, arc);
    while (true) {
      if (arc.label == FST.END_LABEL) {
        // Final state reached: the current output buffer is a complete suggestion.
        res.add(new LookupResult(output.toString(), weight));
        if (res.size() >= num)
          return true;
      } else {
        int save = output.length();
        if (collect(res, num, weight, output, new Arc<Object>().copyFrom(arc))) {
          return true;
        }
        // Rewind the shared output buffer before trying the next sibling arc.
        output.setLength(save);
      }

      if (arc.isLast()) {
        break;
      }
      automaton.readNextArc(arc);
    }
    return false;
  }

  /**
   * Builds the final automaton from a list of entries.
   */
  private FST<Object> buildAutomaton(List<Entry> entries) throws IOException {
    if (entries.size() == 0)
      return null;

    // Sort by utf16 (raw char value)
    final Comparator<Entry> comp = new Comparator<Entry>() {
      public int compare(Entry o1, Entry o2) {
        char [] ch1 = o1.term;
        char [] ch2 = o2.term;
        int len1 = ch1.length;
        int len2 = ch2.length;

        int max = Math.min(len1, len2);
        for (int i = 0; i < max; i++) {
          int v = ch1[i] - ch2[i];
          if (v != 0) return v;
        }
        return len1 - len2;
      }
    };
    Collections.sort(entries, comp);

    // Avoid duplicated identical entries, if possible. This is required because
    // it breaks automaton construction otherwise.
    int len = entries.size();
    int j = 0;
    for (int i = 1; i < len; i++) {
      if (comp.compare(entries.get(j), entries.get(i)) != 0) {
        entries.set(++j, entries.get(i));
      }
    }
    entries = entries.subList(0, j + 1);

    // Build the automaton.
    final Outputs<Object> outputs = NoOutputs.getSingleton();
    final Object empty = outputs.getNoOutput();
    final Builder<Object> builder = new Builder<Object>(FST.INPUT_TYPE.BYTE4, outputs);
    final IntsRef scratchIntsRef = new IntsRef(10);
    for (Entry e : entries) {
      final int termLength = scratchIntsRef.length = e.term.length;

      scratchIntsRef.grow(termLength);
      final int [] ints = scratchIntsRef.ints;
      final char [] chars = e.term;
      for (int i = termLength; --i >= 0;) {
        ints[i] = chars[i];
      }
      builder.add(scratchIntsRef, empty);
    }
    return builder.finish();
  }

  /**
   * Prepends the entry's weight to each entry, encoded as a single byte, so that the
   * root automaton node fans out to all possible priorities, starting with the arc that has
   * the highest weights.
   */
  private void encodeWeightPrefix(List<Entry> entries) {
    for (Entry e : entries) {
      int weight = (int) e.weight;
      assert (weight >= 0 && weight <= buckets) :
        "Weight out of range: " + weight + " [" + buckets + "]";

      // There should be a single empty char reserved in front for the weight.
      e.term[0] = (char) weight;
    }
  }

  /**
   * Split [min, max] range into buckets, reassigning weights. Entries' weights are
   * remapped to [0, buckets] range (so, buckets + 1 buckets, actually).
   */
  private void redistributeWeightsProportionalMinMax(List<Entry> entries, int buckets) {
    float min = entries.get(0).weight;
    float max = min;
    for (Entry e : entries) {
      min = Math.min(e.weight, min);
      max = Math.max(e.weight, max);
    }

    final float range = max - min;
    for (Entry e : entries) {
      // NOTE(review): if all weights are identical, range == 0 and the division
      // yields NaN; the (int) cast maps NaN to 0, so every entry lands in bucket 0.
      // Correct outcome, but it relies on Java's narrowing-conversion rule for NaN.
      e.weight = (int) (buckets * ((e.weight - min) / range)); // int cast equiv. to floor()
    }
  }

  /**
   * Deserialization from disk.
   */
  @Override
  public synchronized boolean load(File storeDir) throws IOException {
    File data = new File(storeDir, FILENAME);
    if (!data.exists() || !data.canRead()) {
      return false;
    }

    InputStream is = new BufferedInputStream(new FileInputStream(data));
    try {
      this.automaton = new FST<Object>(new InputStreamDataInput(is), NoOutputs.getSingleton());
      cacheRootArcs();
    } finally {
      IOUtils.close(is);
    }
    return true;
  }

  /**
   * Serialization to disk.
   */
  @Override
  public synchronized boolean store(File storeDir) throws IOException {
    if (!storeDir.exists() || !storeDir.isDirectory() || !storeDir.canWrite()) {
      return false;
    }

    if (this.automaton == null)
      return false;

    File data = new File(storeDir, FILENAME);
    OutputStream os = new BufferedOutputStream(new FileOutputStream(data));
    try {
      this.automaton.save(new OutputStreamDataOutput(os));
    } finally {
      IOUtils.close(os);
    }
    return true;
  }
}

View File

@ -0,0 +1,58 @@
package org.apache.lucene.search.suggest.fst;
import org.apache.lucene.util.NumericUtils;
/**
 * Converts normalized float representations ({@link Float#floatToIntBits(float)})
 * into integers that are directly sortable in int4 representation (or unsigned values or
 * after promoting to a long with higher 32-bits zeroed).
 */
class FloatMagic {
  /** Non-instantiable utility class. */
  private FloatMagic() {
    // No instances.
  }

  /**
   * Convert a float to a directly sortable unsigned integer. For sortable signed
   * integers, see {@link NumericUtils#floatToSortableInt(float)}.
   */
  public static int toSortable(float f) {
    return floatBitsToUnsignedOrdered(Float.floatToRawIntBits(f));
  }

  /**
   * Back from {@link #toSortable(float)} to float.
   */
  public static float fromSortable(int v) {
    return Float.intBitsToFloat(unsignedOrderedToFloatBits(v));
  }

  /**
   * Convert float bits to directly sortable bits.
   * Normalizes all NaNs to canonical form.
   */
  static int floatBitsToUnsignedOrdered(int v) {
    // Canonicalize NaN ranges. I assume this check will be faster here than
    // (v == v) == false on the FPU? We don't distinguish between different
    // flavors of NaNs here (see http://en.wikipedia.org/wiki/NaN). I guess
    // in Java this doesn't matter much anyway.
    if ((v & 0x7fffffff) > 0x7f800000) {
      // Apply the logic below to a canonical "quiet NaN"
      return 0x7fc00000 ^ 0x80000000;
    }

    if (v < 0) {
      // Reverse the order of negative values and push them before positive values.
      return ~v;
    } else {
      // Shift positive values after negative, but before NaNs, they're sorted already.
      return v ^ 0x80000000;
    }
  }

  /**
   * Back from {@link #floatBitsToUnsignedOrdered(int)}.
   */
  static int unsignedOrderedToFloatBits(int v) {
    if (v < 0)
      // Was a positive float: clear the marker bit to recover the original bits.
      return v & ~0x80000000;
    else
      // Was a negative float: undo the bit complement.
      return ~v;
  }
}

View File

@ -0,0 +1,28 @@
package org.apache.lucene.search.suggest.fst;
import java.util.*;
import org.apache.lucene.util.BytesRef;
/**
 * A {@link BytesRefSorter} that buffers every entry on the heap and sorts
 * them lazily when the iterator is first requested.
 */
public final class InMemorySorter implements BytesRefSorter {
  // TODO: use a single byte[] to back up all entries?
  private final ArrayList<BytesRef> entries = new ArrayList<BytesRef>();

  // Once an iterator has been handed out, further additions are rejected.
  private boolean frozen = false;

  @Override
  public void add(BytesRef utf8) {
    if (frozen) {
      throw new IllegalStateException();
    }
    // Deep-copy: the caller may reuse the incoming BytesRef's backing array.
    entries.add(BytesRef.deepCopyOf(utf8));
  }

  @Override
  public Iterator<BytesRef> iterator() {
    frozen = true;
    Collections.sort(entries, BytesRef.getUTF8SortedAsUnicodeComparator());
    return Collections.unmodifiableCollection(entries).iterator();
  }
}

View File

@ -0,0 +1,483 @@
package org.apache.lucene.search.suggest.fst;
import java.io.*;
import java.util.*;
import org.apache.lucene.util.*;
import org.apache.lucene.util.PriorityQueue;
// TODO: the buffer is currently byte[][] which with very small arrays will terribly overallocate
// memory (alignments) and make GC very happy.
//
// We could move it to a single byte[] + and use custom sorting, but we'd need to check if this
// yields any improvement first.
/**
 * On-disk sorting of byte arrays. Each byte array (entry) is composed of the following
 * fields:
 * <ul>
 * <li>(two bytes) length of the following byte array,
 * <li>exactly the above count of bytes for the sequence to be sorted.
 * </ul>
 *
 * @see #sort(File, File)
 */
public final class Sort {
  /** One megabyte, in bytes. */
  public final static int MB = 1024 * 1024;
  /** One gigabyte, in bytes. */
  public final static int GB = MB * 1024;

  /**
   * Minimum recommended buffer size for sorting.
   */
  public final static int MIN_BUFFER_SIZE_MB = 32;

  /**
   * Maximum number of temporary files before doing an intermediate merge.
   */
  public final static int MAX_TEMPFILES = 128;

  /**
   * Minimum slot buffer expansion.
   */
  private final static int MIN_EXPECTED_GROWTH = 1000;

  /**
   * A bit more descriptive unit for constructors.
   *
   * @see #automatic()
   * @see #megabytes(int)
   */
  public static final class BufferSize {
    final int bytes;

    private BufferSize(long bytes) {
      if (bytes > Integer.MAX_VALUE) {
        throw new IllegalArgumentException("Buffer too large for Java ("
            + (Integer.MAX_VALUE / MB) + "mb max): " + bytes);
      }
      this.bytes = (int) bytes;
    }

    /**
     * Creates a {@link BufferSize} of <code>mb</code> megabytes. The
     * multiplication is done in long arithmetic so that values of 2048 and
     * above trip the constructor's range check instead of silently wrapping
     * around in int arithmetic.
     */
    public static BufferSize megabytes(int mb) {
      return new BufferSize(mb * (long) MB);
    }

    /**
     * Approximately half of the currently available free heap, but no less
     * than {@link #MIN_BUFFER_SIZE_MB}.
     */
    public static BufferSize automatic() {
      long halfOfFreeHeap = Runtime.getRuntime().freeMemory() / 2;
      // Take half the free heap, but never less than the recommended minimum
      // (per this method's contract) and never more than an int can represent
      // (the BufferSize constructor rejects larger values).
      return new BufferSize(Math.max((long) MIN_BUFFER_SIZE_MB * MB,
                                     Math.min(halfOfFreeHeap, Integer.MAX_VALUE)));
    }
  }

  /**
   * byte[] in unsigned byte order.
   */
  static final Comparator<byte[]> unsignedByteOrderComparator = new Comparator<byte[]>() {
    public int compare(byte[] left, byte[] right) {
      final int max = Math.min(left.length, right.length);
      for (int i = 0, j = 0; i < max; i++, j++) {
        // Mask to 0..255 so bytes compare as unsigned values.
        int diff = (left[i] & 0xff) - (right[j] & 0xff);
        if (diff != 0)
          return diff;
      }
      // Shorter sequence sorts first when one is a prefix of the other.
      return left.length - right.length;
    }
  };

  /**
   * Sort info (debugging mostly). Non-static: reads the enclosing Sort's
   * {@link Sort#ramBufferSize}.
   */
  public class SortInfo {
    public int tempMergeFiles;
    public int mergeRounds;
    public int lines;
    public long mergeTime;
    public long sortTime;
    public long totalTime;
    public long readTime;
    public final long bufferSize = ramBufferSize.bytes;

    @Override
    public String toString() {
      return String.format(Locale.ENGLISH,
          "time=%.2f sec. total (%.2f reading, %.2f sorting, %.2f merging), lines=%d, temp files=%d, merges=%d, soft ram limit=%.2f MB",
          totalTime / 1000.0d, readTime / 1000.0d, sortTime / 1000.0d, mergeTime / 1000.0d,
          lines, tempMergeFiles, mergeRounds,
          (double) bufferSize / MB);
    }
  }

  private final static byte [][] EMPTY = new byte [0][];

  private final BufferSize ramBufferSize;
  private final File tempDirectory;

  // Slots for one in-memory partition; reused between partitions.
  private byte [][] buffer = new byte [0][];
  private SortInfo sortInfo;
  private int maxTempFiles;

  /**
   * Defaults constructor.
   *
   * @see #defaultTempDir()
   * @see BufferSize#automatic()
   */
  public Sort() throws IOException {
    this(BufferSize.automatic(), defaultTempDir(), MAX_TEMPFILES);
  }

  /**
   * All-details constructor.
   *
   * @param ramBufferSize Soft limit on the in-memory buffer used for each partition.
   * @param tempDirectory Directory for intermediate partition files.
   * @param maxTempfiles Maximum number of partition files before an intermediate merge.
   */
  public Sort(BufferSize ramBufferSize, File tempDirectory, int maxTempfiles) {
    if (ramBufferSize.bytes < 1024 * 1024 / 2) {
      // Half-meg buffer is the absolute minimum.
      throw new IllegalArgumentException("At least 0.5MB RAM buffer is needed: "
          + ramBufferSize.bytes);
    }

    if (maxTempfiles < 2) {
      throw new IllegalArgumentException("maxTempFiles must be >= 2");
    }

    this.ramBufferSize = ramBufferSize;
    this.tempDirectory = tempDirectory;
    this.maxTempFiles = maxTempfiles;
  }

  /**
   * Sort input to output, using the buffer size given at construction time as
   * a soft hint. The amount of allocated memory may deviate from the hint (may
   * be smaller or larger).
   *
   * @return Statistics about the executed sort.
   */
  public SortInfo sort(File input, File output) throws IOException {
    sortInfo = new SortInfo();
    sortInfo.totalTime = System.currentTimeMillis();

    output.delete();

    ArrayList<File> merges = new ArrayList<File>();
    ByteSequencesReader is = new ByteSequencesReader(input);
    boolean success = false;
    try {
      int lines = 0;
      while ((lines = readPartition(is)) > 0) {
        merges.add(sortPartition(lines));
        sortInfo.tempMergeFiles++;
        sortInfo.lines += lines;

        // Handle intermediate merges so the number of open files during the
        // final merge stays bounded by maxTempFiles.
        if (merges.size() == maxTempFiles) {
          File intermediate = File.createTempFile("sort", "intermediate", tempDirectory);
          mergePartitions(merges, intermediate);
          for (File file : merges) {
            file.delete();
          }
          merges.clear();
          merges.add(intermediate);
          sortInfo.tempMergeFiles++;
        }
      }
      success = true;
    } finally {
      if (success)
        IOUtils.close(is);
      else
        IOUtils.closeWhileHandlingException(is);
    }

    // One partition, try to rename or copy if unsuccessful.
    if (merges.size() == 1) {
      // If simple rename doesn't work this means the output is
      // on a different volume or something. Copy the input then.
      if (!merges.get(0).renameTo(output)) {
        copy(merges.get(0), output);
        // The partition file is no longer needed after a successful copy.
        merges.get(0).delete();
      }
    } else {
      // otherwise merge the partitions with a priority queue.
      mergePartitions(merges, output);
      for (File file : merges) {
        file.delete();
      }
    }

    sortInfo.totalTime = (System.currentTimeMillis() - sortInfo.totalTime);
    return sortInfo;
  }

  /**
   * Returns the default temporary directory. By default, java.io.tmpdir. If not accessible
   * or not available, an IOException is thrown
   */
  public static File defaultTempDir() throws IOException {
    String tempDirPath = System.getProperty("java.io.tmpdir");
    if (tempDirPath == null)
      throw new IOException("Java has no temporary folder property (java.io.tmpdir)?");

    File tempDirectory = new File(tempDirPath);
    if (!tempDirectory.exists() || !tempDirectory.canWrite()) {
      throw new IOException("Java's temporary folder not present or writeable?: "
          + tempDirectory.getAbsolutePath());
    }
    return tempDirectory;
  }

  /**
   * Copies one file to another.
   */
  private static void copy(File file, File output) throws IOException {
    // 16kb copy buffer (empirical pick).
    byte [] buffer = new byte [16 * 1024];
    InputStream is = null;
    OutputStream os = null;
    try {
      is = new FileInputStream(file);
      os = new FileOutputStream(output);
      int length;
      while ((length = is.read(buffer)) > 0) {
        os.write(buffer, 0, length);
      }
    } finally {
      IOUtils.close(is, os);
    }
  }

  /** Sort a single partition in-memory. */
  protected File sortPartition(int len) throws IOException {
    byte [][] data = this.buffer;
    File tempFile = File.createTempFile("sort", "partition", tempDirectory);

    long start = System.currentTimeMillis();
    Arrays.sort(data, 0, len, unsignedByteOrderComparator);
    sortInfo.sortTime += (System.currentTimeMillis() - start);

    ByteSequencesWriter out = new ByteSequencesWriter(tempFile);
    try {
      for (int i = 0; i < len; i++) {
        assert data[i].length <= Short.MAX_VALUE;
        out.write(data[i]);
      }
      // Eager close so that any flush failure surfaces here rather than in
      // finally (the second close in finally is a harmless no-op).
      out.close();

      // Clean up the buffer for the next partition.
      this.buffer = EMPTY;
      return tempFile;
    } finally {
      IOUtils.close(out);
    }
  }

  /** Merge a list of sorted temporary files (partitions) into an output file */
  void mergePartitions(List<File> merges, File outputFile) throws IOException {
    long start = System.currentTimeMillis();

    ByteSequencesWriter out = new ByteSequencesWriter(outputFile);

    // Smallest top entry wins; byte order matches unsignedByteOrderComparator
    // via BytesRef comparison.
    PriorityQueue<FileAndTop> queue = new PriorityQueue<FileAndTop>(merges.size()) {
      protected boolean lessThan(FileAndTop a, FileAndTop b) {
        return a.current.compareTo(b.current) < 0;
      }
    };

    ByteSequencesReader [] streams = new ByteSequencesReader [merges.size()];
    try {
      // Open streams and read the top for each file
      for (int i = 0; i < merges.size(); i++) {
        streams[i] = new ByteSequencesReader(merges.get(i));
        byte line[] = streams[i].read();
        if (line != null) {
          queue.insertWithOverflow(new FileAndTop(i, line));
        }
      }

      // Unix utility sort() uses ordered array of files to pick the next line from, updating
      // it as it reads new lines. The PQ used here is a more elegant solution and has
      // a nicer theoretical complexity bound :) The entire sorting process is I/O bound anyway
      // so it shouldn't make much of a difference (didn't check).
      FileAndTop top;
      while ((top = queue.top()) != null) {
        out.write(top.current);
        if (!streams[top.fd].read(top.current)) {
          queue.pop();
        } else {
          queue.updateTop();
        }
      }

      sortInfo.mergeTime += System.currentTimeMillis() - start;
      sortInfo.mergeRounds++;
    } finally {
      // The logic below is: if an exception occurs in closing out, it has a priority over exceptions
      // happening in closing streams.
      try {
        IOUtils.close(streams);
      } finally {
        IOUtils.close(out);
      }
    }
  }

  /** Read in a single partition of data */
  int readPartition(ByteSequencesReader reader) throws IOException {
    long start = System.currentTimeMillis();
    // We will be reallocating from scratch.
    Arrays.fill(this.buffer, null);

    int bytesLimit = this.ramBufferSize.bytes;
    byte [][] data = this.buffer;
    byte[] line;
    int linesRead = 0;
    while ((line = reader.read()) != null) {
      if (linesRead + 1 >= data.length) {
        data = Arrays.copyOf(data,
            ArrayUtil.oversize(linesRead + MIN_EXPECTED_GROWTH,
                RamUsageEstimator.NUM_BYTES_OBJECT_REF));
      }
      data[linesRead++] = line;

      // Account for the created objects.
      // (buffer slots do not account to buffer size.)
      bytesLimit -= line.length + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER;
      if (bytesLimit < 0) {
        break;
      }
    }
    this.buffer = data;

    sortInfo.readTime += (System.currentTimeMillis() - start);
    return linesRead;
  }

  /** A partition stream and its current smallest (top) entry. */
  static class FileAndTop {
    final int fd;
    final BytesRef current;

    FileAndTop(int fd, byte [] firstLine) {
      this.fd = fd;
      this.current = new BytesRef(firstLine);
    }
  }

  /**
   * Utility class to emit length-prefixed byte[] entries to an output stream for sorting.
   * Complementary to {@link ByteSequencesReader}.
   */
  public static class ByteSequencesWriter implements Closeable {
    private final DataOutput os;

    public ByteSequencesWriter(File file) throws IOException {
      this(new DataOutputStream(
          new BufferedOutputStream(
              new FileOutputStream(file))));
    }

    public ByteSequencesWriter(DataOutput os) {
      this.os = os;
    }

    public void write(BytesRef ref) throws IOException {
      assert ref != null;
      write(ref.bytes, ref.offset, ref.length);
    }

    public void write(byte [] bytes) throws IOException {
      write(bytes, 0, bytes.length);
    }

    /**
     * Writes a single length-prefixed entry. The length is emitted as a
     * two-byte short, so <code>len</code> must not exceed
     * {@link Short#MAX_VALUE}.
     */
    public void write(byte [] bytes, int off, int len) throws IOException {
      assert bytes != null;
      assert off >= 0 && off + len <= bytes.length;
      assert len >= 0;
      os.writeShort(len);
      os.write(bytes, off, len);
    }

    /**
     * Closes the provided {@link DataOutput} if it is {@link Closeable}.
     */
    @Override
    public void close() throws IOException {
      if (os instanceof Closeable) {
        ((Closeable) os).close();
      }
    }
  }

  /**
   * Utility class to read length-prefixed byte[] entries from an input.
   * Complementary to {@link ByteSequencesWriter}.
   */
  public static class ByteSequencesReader implements Closeable {
    private final DataInput is;

    public ByteSequencesReader(File file) throws IOException {
      this(new DataInputStream(
          new BufferedInputStream(
              new FileInputStream(file))));
    }

    public ByteSequencesReader(DataInput is) {
      this.is = is;
    }

    /**
     * Reads the next entry into the provided {@link BytesRef}. The internal
     * storage is resized if needed.
     *
     * @return Returns <code>false</code> if EOF occurred when trying to read
     * the header of the next sequence. Returns <code>true</code> otherwise.
     * @throws EOFException if the file ends before the full sequence is read.
     */
    public boolean read(BytesRef ref) throws IOException {
      short length;
      try {
        length = is.readShort();
      } catch (EOFException e) {
        return false;
      }

      ref.grow(length);
      ref.offset = 0;
      ref.length = length;
      is.readFully(ref.bytes, 0, length);
      return true;
    }

    /**
     * Reads the next entry and returns it if successful.
     *
     * @see #read(BytesRef)
     *
     * @return Returns <code>null</code> if EOF occurred before the next entry
     * could be read.
     * @throws EOFException if the file ends before the full sequence is read.
     */
    public byte[] read() throws IOException {
      short length;
      try {
        length = is.readShort();
      } catch (EOFException e) {
        return null;
      }

      assert length >= 0 : "Sanity: sequence length < 0: " + length;
      byte [] result = new byte [length];
      is.readFully(result);
      return result;
    }

    /**
     * Closes the provided {@link DataInput} if it is {@link Closeable}.
     */
    @Override
    public void close() throws IOException {
      if (is instanceof Closeable) {
        ((Closeable) is).close();
      }
    }
  }
}

View File

@ -29,10 +29,9 @@ import java.util.Locale;
import java.util.Random;
import java.util.concurrent.Callable;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.*;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.fst.FSTLookup;
import org.apache.lucene.search.suggest.fst.FSTCompletionLookup;
import org.apache.lucene.search.suggest.jaspell.JaspellLookup;
import org.apache.lucene.search.suggest.tst.TSTLookup;
@ -48,7 +47,7 @@ public class LookupBenchmarkTest extends LuceneTestCase {
private final List<Class<? extends Lookup>> benchmarkClasses = Arrays.asList(
JaspellLookup.class,
TSTLookup.class,
FSTLookup.class);
FSTCompletionLookup.class);
private final static int rounds = 15;
private final static int warmup = 5;

View File

@ -19,7 +19,7 @@ package org.apache.lucene.search.suggest;
import java.io.File;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.fst.FSTLookup;
import org.apache.lucene.search.suggest.fst.FSTCompletionLookup;
import org.apache.lucene.search.suggest.jaspell.JaspellLookup;
import org.apache.lucene.search.suggest.tst.TSTLookup;
import org.apache.lucene.util.LuceneTestCase;
@ -51,9 +51,9 @@ public class PersistenceTest extends LuceneTestCase {
}
public void testFSTPersistence() throws Exception {
runTest(FSTLookup.class, false);
runTest(FSTCompletionLookup.class, false);
}
private void runTest(Class<? extends Lookup> lookupClass,
boolean supportsExactWeights) throws Exception {

View File

@ -0,0 +1,44 @@
package org.apache.lucene.search.suggest.fst;
import java.util.Iterator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;
/**
 * Sanity checks shared by all {@link BytesRefSorter} implementations.
 */
public class BytesRefSortersTest extends LuceneTestCase {
  @Test
  public void testExternalRefSorter() throws Exception {
    check(new ExternalRefSorter(new Sort()));
  }

  @Test
  public void testInMemorySorter() throws Exception {
    check(new InMemorySorter());
  }

  private void check(BytesRefSorter sorter) throws Exception {
    // Feed 100 entries of random length (0..255) and random content.
    for (int count = 0; count < 100; count++) {
      byte[] chunk = new byte[random.nextInt(256)];
      random.nextBytes(chunk);
      sorter.add(new BytesRef(chunk));
    }

    // Create two iterators and check that they're aligned with each other.
    Iterator<BytesRef> first = sorter.iterator();
    Iterator<BytesRef> second = sorter.iterator();

    // Verify sorter contract: adding after iteration has started must fail.
    try {
      sorter.add(new BytesRef(new byte[1]));
      fail("expected contract violation.");
    } catch (IllegalStateException e) {
      // Expected.
    }

    while (first.hasNext() && second.hasNext()) {
      assertEquals(first.next(), second.next());
    }
    assertEquals(first.hasNext(), second.hasNext());
  }
}

View File

@ -17,40 +17,38 @@ package org.apache.lucene.search.suggest.fst;
* limitations under the License.
*/
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.Random;
import java.util.*;
import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.search.suggest.fst.FSTLookup;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.search.suggest.LookupBenchmarkTest;
import org.apache.lucene.search.suggest.TermFreq;
import org.apache.lucene.search.suggest.TermFreqArrayIterator;
import org.apache.lucene.search.suggest.*;
import org.apache.lucene.search.suggest.fst.FSTCompletion.Completion;
import org.apache.lucene.util.*;
/**
* Unit tests for {@link FSTLookup}.
* Unit tests for {@link FSTCompletion}.
*/
public class FSTLookupTest extends LuceneTestCase {
public class FSTCompletionTest extends LuceneTestCase {
public static TermFreq tf(String t, float v) {
return new TermFreq(t, v);
}
private FSTLookup lookup;
private FSTCompletion completion;
private FSTCompletion completionAlphabetical;
public void setUp() throws Exception {
super.setUp();
lookup = new FSTLookup();
lookup.build(new TermFreqArrayIterator(evalKeys()));
FSTCompletionBuilder builder = new FSTCompletionBuilder();
for (TermFreq tf : evalKeys()) {
builder.add(new BytesRef(tf.term), (int) tf.v);
}
completion = builder.build();
completionAlphabetical = new FSTCompletion(completion.getFST(), false, true);
}
private TermFreq[] evalKeys() {
final TermFreq[] keys = new TermFreq[] {
tf("one", 0.5f),
tf("one", 0),
tf("oneness", 1),
tf("onerous", 1),
tf("onesimus", 1),
@ -64,103 +62,152 @@ public class FSTLookupTest extends LuceneTestCase {
tf("foundation", 1),
tf("fourblah", 1),
tf("fourteen", 1),
tf("four", 0.5f),
tf("fourier", 0.5f),
tf("fourty", 0.5f),
tf("four", 0f),
tf("fourier", 0f),
tf("fourty", 0f),
tf("xo", 1),
};
return keys;
}
public void testExactMatchHighPriority() throws Exception {
assertMatchEquals(lookup.lookup("two", true, 1), "two/1.0");
assertMatchEquals(completion.lookup("two", 1),
"two/1.0");
}
public void testExactMatchLowPriority() throws Exception {
assertMatchEquals(lookup.lookup("one", true, 2),
assertMatchEquals(completion.lookup("one", 2),
"one/0.0",
"oneness/1.0");
}
public void testExactMatchReordering() throws Exception {
// Check reordering of exact matches.
assertMatchEquals(completion.lookup("four", 4),
"four/0.0",
"fourblah/1.0",
"fourteen/1.0",
"fourier/0.0");
}
public void testRequestedCount() throws Exception {
// 'one' is promoted after collecting two higher ranking results.
assertMatchEquals(lookup.lookup("one", true, 2),
"one/0.0",
"oneness/1.0");
// 'one' is at the top after collecting all alphabetical results.
assertMatchEquals(lookup.lookup("one", false, 2),
assertMatchEquals(completion.lookup("one", 2),
"one/0.0",
"oneness/1.0");
// 'four' is collected in a bucket and then again as an exact match.
assertMatchEquals(lookup.lookup("four", true, 2),
assertMatchEquals(completion.lookup("four", 2),
"four/0.0",
"fourblah/1.0");
// Check reordering of exact matches.
assertMatchEquals(lookup.lookup("four", true, 4),
assertMatchEquals(completion.lookup("four", 4),
"four/0.0",
"fourblah/1.0",
"fourteen/1.0",
"fourier/0.0");
lookup = new FSTLookup(10, false);
lookup.build(new TermFreqArrayIterator(evalKeys()));
// 'one' is at the top after collecting all alphabetical results.
assertMatchEquals(completionAlphabetical.lookup("one", 2),
"one/0.0",
"oneness/1.0");
// 'one' is not promoted after collecting two higher ranking results.
assertMatchEquals(lookup.lookup("one", true, 2),
FSTCompletion noPromotion = new FSTCompletion(completion.getFST(), true, false);
assertMatchEquals(noPromotion.lookup("one", 2),
"oneness/1.0",
"onerous/1.0");
// 'one' is at the top after collecting all alphabetical results.
assertMatchEquals(lookup.lookup("one", false, 2),
assertMatchEquals(completionAlphabetical.lookup("one", 2),
"one/0.0",
"oneness/1.0");
}
public void testMiss() throws Exception {
assertMatchEquals(lookup.lookup("xyz", true, 1));
assertMatchEquals(completion.lookup("xyz", 1));
}
public void testAlphabeticWithWeights() throws Exception {
assertEquals(0, lookup.lookup("xyz", false, 1).size());
assertEquals(0, completionAlphabetical.lookup("xyz", 1).size());
}
public void testFullMatchList() throws Exception {
assertMatchEquals(lookup.lookup("one", true, Integer.MAX_VALUE),
assertMatchEquals(completion.lookup("one", Integer.MAX_VALUE),
"oneness/1.0",
"onerous/1.0",
"onesimus/1.0",
"one/0.0");
}
public void testThreeByte() throws Exception {
String key = new String(new byte[] {
(byte) 0xF0, (byte) 0xA4, (byte) 0xAD, (byte) 0xA2}, "UTF-8");
FSTCompletionBuilder builder = new FSTCompletionBuilder();
builder.add(new BytesRef(key), 0);
FSTCompletion lookup = builder.build();
List<Completion> result = lookup.lookup(key, 1);
assertEquals(1, result.size());
}
public void testLargeInputConstantWeights() throws Exception {
FSTCompletionLookup lookup = new FSTCompletionLookup(10, true);
Random r = random;
List<TermFreq> keys = new ArrayList<TermFreq>();
for (int i = 0; i < 5000; i++) {
keys.add(new TermFreq(_TestUtil.randomSimpleString(r), -1.0f));
}
lookup.build(new TermFreqArrayIterator(keys));
// All the weights were constant, so all returned buckets must be constant, whatever they
// are.
Float previous = null;
for (TermFreq tf : keys) {
Float current = lookup.get(tf.term);
if (previous != null) {
assertEquals(previous, current);
}
previous = current;
}
}
@Nightly
public void testMultilingualInput() throws Exception {
List<TermFreq> input = LookupBenchmarkTest.readTop50KWiki();
lookup = new FSTLookup();
FSTCompletionLookup lookup = new FSTCompletionLookup();
lookup.build(new TermFreqArrayIterator(input));
for (TermFreq tf : input) {
assertTrue("Not found: " + tf.term, lookup.get(tf.term) != null);
assertEquals(tf.term, lookup.lookup(tf.term, true, 1).get(0).key);
}
List<LookupResult> result = lookup.lookup("wit", true, 5);
assertEquals(5, result.size());
assertTrue(result.get(0).key.equals("wit")); // exact match.
assertTrue(result.get(1).key.equals("with")); // highest count.
}
public void testEmptyInput() throws Exception {
lookup = new FSTLookup();
lookup.build(new TermFreqArrayIterator(new TermFreq[0]));
assertMatchEquals(lookup.lookup("", true, 10));
completion = new FSTCompletionBuilder().build();
assertMatchEquals(completion.lookup("", 10));
}
@Nightly
public void testRandom() throws Exception {
List<TermFreq> freqs = new ArrayList<TermFreq>();
Random rnd = random;
for (int i = 0; i < 5000; i++) {
freqs.add(new TermFreq("" + rnd.nextLong(), rnd.nextInt(100)));
for (int i = 0; i < 2500 + rnd.nextInt(2500); i++) {
float weight = rnd.nextFloat() * 100;
freqs.add(new TermFreq("" + rnd.nextLong(), weight));
}
lookup = new FSTLookup();
FSTCompletionLookup lookup = new FSTCompletionLookup();
lookup.build(new TermFreqArrayIterator(freqs.toArray(new TermFreq[freqs.size()])));
for (TermFreq tf : freqs) {
@ -174,12 +221,13 @@ public class FSTLookupTest extends LuceneTestCase {
}
}
private void assertMatchEquals(List<LookupResult> res, String... expected) {
private void assertMatchEquals(List<Completion> res, String... expected) {
String [] result = new String [res.size()];
for (int i = 0; i < res.size(); i++)
for (int i = 0; i < res.size(); i++) {
result[i] = res.get(i).toString();
if (!Arrays.equals(expected, result)) {
}
if (!Arrays.equals(stripScore(expected), stripScore(result))) {
int colLen = Math.max(maxLen(expected), maxLen(result));
StringBuilder b = new StringBuilder();
@ -196,6 +244,14 @@ public class FSTLookupTest extends LuceneTestCase {
}
}
private String[] stripScore(String[] expected) {
String [] result = new String [expected.length];
for (int i = 0; i < result.length; i++) {
result[i] = expected[i].replaceAll("\\/[0-9\\.]+", "");
}
return result;
}
private int maxLen(String[] result) {
int len = 0;
for (String s : result)

View File

@ -0,0 +1,119 @@
package org.apache.lucene.search.suggest.fst;
import java.util.*;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.NumericUtils;
import org.junit.Ignore;
import org.junit.Test;
/**
 * Tests for FloatMagic: a float-to-sortable-int mapping whose unsigned integer
 * order matches the natural order of the source floats.
 */
public class FloatMagicTest extends LuceneTestCase {
  public void testFloatMagic() {
    // A mix of ordinary values, signed zeros, infinities and several NaN bit
    // patterns (including manually constructed "invalid" combinations).
    ArrayList<Float> floats = new ArrayList<Float>(Arrays.asList(new Float [] {
        Float.intBitsToFloat(0x7f800001), // NaN (invalid combination).
        Float.intBitsToFloat(0x7fffffff), // NaN (invalid combination).
        Float.intBitsToFloat(0xff800001), // NaN (invalid combination).
        Float.intBitsToFloat(0xffffffff), // NaN (invalid combination).
        Float.POSITIVE_INFINITY,
        Float.MAX_VALUE,
        100f,
        0f,
        0.1f,
        Float.MIN_VALUE,
        Float.NaN,
        -0.0f,
        -Float.MIN_VALUE,
        -0.1f,
        -1f,
        -10f,
        Float.NEGATIVE_INFINITY }));

    // Sort them using juc.
    Collections.sort(floats);

    // Convert to sortable int4 representation (as long to have an unsigned sort).
    long [] int4 = new long [floats.size()];
    for (int i = 0; i < floats.size(); i++) {
      int4[i] = FloatMagic.toSortable(floats.get(i)) & 0xffffffffL;

      // Diagnostic dump: raw bits, sortable form, round-trip, NumericUtils form.
      System.out.println(
          String.format("raw %8s sortable %8s %8s numutils %8s %s",
              Integer.toHexString(Float.floatToRawIntBits(floats.get(i))),
              Integer.toHexString(FloatMagic.toSortable(floats.get(i))),
              Integer.toHexString(FloatMagic.unsignedOrderedToFloatBits(FloatMagic.toSortable(floats.get(i)))),
              Integer.toHexString(NumericUtils.floatToSortableInt(floats.get(i))),
              floats.get(i)));
    }

    // Sort and compare. Should be identical order.
    Arrays.sort(int4);
    ArrayList<Float> backFromFixed = new ArrayList<Float>();
    for (int i = 0; i < int4.length; i++) {
      backFromFixed.add(FloatMagic.fromSortable((int) int4[i]));
    }

    for (int i = 0; i < int4.length; i++) {
      System.out.println(
          floats.get(i) + " " + FloatMagic.fromSortable((int) int4[i]));
    }

    assertEquals(floats, backFromFixed);
  }

  @Ignore("Once checked, valid forever?") @Test
  public void testRoundTripFullRange() {
    // Exhaustively round-trip every 32-bit float pattern. NaN != NaN, so NaNs
    // are matched with isNaN on both sides instead of ==.
    int i = 0;
    do {
      float f = Float.intBitsToFloat(i);
      float f2 = FloatMagic.fromSortable(FloatMagic.toSortable(f));

      if (!((Float.isNaN(f) && Float.isNaN(f2)) || f == f2)) {
        throw new RuntimeException("! " + Integer.toHexString(i) + "> " + f + " " + f2);
      }

      if ((i & 0xffffff) == 0) {
        // Progress marker every 2^24 patterns.
        System.out.println(Integer.toHexString(i));
      }

      i++;
    } while (i != 0); // int overflow wraps back to 0 after covering all 2^32 patterns.
  }

  @Ignore("Once checked, valid forever?") @Test
  public void testIncreasingFullRange() {
    // Exhaustively verify the sortable mapping is strictly increasing over the
    // ordered float range, segment by segment.

    // -infinity ... -0.0
    for (int i = 0xff800000; i != 0x80000000; i--) {
      checkSmaller(i, i - 1);
    }

    // -0.0 +0.0
    checkSmaller(0x80000000, 0);

    // +0.0 ... +infinity
    for (int i = 0; i != 0x7f800000; i++) {
      checkSmaller(i, i + 1);
    }

    // All other are NaNs and should be after positive infinity.
    final long infinity = toSortableL(Float.POSITIVE_INFINITY);
    for (int i = 0x7f800001; i != 0x7fffffff; i++) {
      assertTrue(infinity < toSortableL(Float.intBitsToFloat(i)));
    }
    for (int i = 0xff800001; i != 0xffffffff; i++) {
      assertTrue(infinity < toSortableL(Float.intBitsToFloat(i)));
    }
  }

  // Widens the sortable form to an unsigned 32-bit value held in a long so
  // that < compares unsigned.
  private long toSortableL(float f) {
    return FloatMagic.toSortable(f) & 0xffffffffL;
  }

  // Asserts that the float with bits i1 orders before the float with bits i2,
  // both as floats and in the sortable representation.
  private void checkSmaller(int i1, int i2) {
    float f1 = Float.intBitsToFloat(i1);
    float f2 = Float.intBitsToFloat(i2);
    if (f1 > f2) {
      throw new AssertionError(f1 + " " + f2 + " " + i1 + " " + i2);
    }
    assertTrue(toSortableL(f1) < toSortableL(f2));
  }
}

View File

@ -0,0 +1,43 @@
package org.apache.lucene.search.suggest.fst;
import java.io.*;
import org.apache.lucene.util.BytesRef;
/**
 * Try to build a suggester from a large data set. The input is a simple text
 * file, newline-delimited, one entry per line.
 *
 * <p>The input path may be passed as the first command-line argument;
 * otherwise a hard-coded default is used.
 */
public class LargeInputFST {
  public static void main(String[] args) throws IOException {
    // Allow overriding the input file from the command line; fall back to the
    // original hard-coded path for backward compatibility.
    File input = args.length > 0
        ? new File(args[0])
        : new File("/home/dweiss/tmp/shuffled.dict");

    int buckets = 20;
    int shareMaxTail = 10;

    ExternalRefSorter sorter = new ExternalRefSorter(new Sort());
    FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, sorter, shareMaxTail);

    BufferedReader reader = new BufferedReader(
        new InputStreamReader(
            new FileInputStream(input), "UTF-8"));
    try {
      BytesRef scratch = new BytesRef();
      String line;
      int count = 0;
      while ((line = reader.readLine()) != null) {
        scratch.copyChars(line);
        // Cycle entries through all weight buckets.
        builder.add(scratch, count % buckets);
        if ((count++ % 100000) == 0) {
          System.err.println("Line: " + count);
        }
      }
    } finally {
      // Previously leaked: always release the input file handle, even if
      // adding an entry throws.
      reader.close();
    }

    System.out.println("Building FSTCompletion.");
    FSTCompletion completion = builder.build();

    File fstFile = new File("completion.fst");
    System.out.println("Done. Writing automaton: " + fstFile.getAbsolutePath());
    completion.getFST().save(fstFile);
  }
}

View File

@ -0,0 +1,126 @@
package org.apache.lucene.search.suggest.fst;
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import org.apache.lucene.search.suggest.fst.Sort.BufferSize;
import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesWriter;
import org.apache.lucene.search.suggest.fst.Sort.SortInfo;
import org.apache.lucene.util.*;
import org.junit.*;
/**
 * Tests for on-disk merge sorting ({@code Sort}): empty input, single entries,
 * forced intermediate merges and randomly generated data.
 */
public class TestSort extends LuceneTestCase {
  private File tempDir;

  @Before
  public void prepareTempDir() throws IOException {
    tempDir = _TestUtil.getTempDir("mergesort");
    _TestUtil.rmDir(tempDir);
    tempDir.mkdirs();
  }

  @After
  public void cleanup() throws IOException {
    if (tempDir != null) {
      _TestUtil.rmDir(tempDir);
    }
  }

  @Test
  public void testEmpty() throws Exception {
    checkSort(new Sort(), new byte [][] {});
  }

  @Test
  public void testSingleLine() throws Exception {
    checkSort(new Sort(), new byte [][] {
        "Single line only.".getBytes("UTF-8")
    });
  }

  @Test
  public void testIntermediateMerges() throws Exception {
    // Sort 20 mb worth of data with 1mb buffer, binary merging (forces many
    // intermediate merge rounds).
    SortInfo info = checkSort(new Sort(BufferSize.megabytes(1), Sort.defaultTempDir(), 2),
        generateRandom(Sort.MB * 20));
    assertTrue(info.mergeRounds > 10);
  }

  @Test
  public void testSmallRandom() throws Exception {
    // Sort 20 mb worth of data with 1mb buffer.
    SortInfo sortInfo = checkSort(new Sort(BufferSize.megabytes(1), Sort.defaultTempDir(), Sort.MAX_TEMPFILES),
        generateRandom(Sort.MB * 20));
    assertEquals(1, sortInfo.mergeRounds);
  }

  @Test @Nightly
  public void testLargerRandom() throws Exception {
    // Sort 100MB worth of data with 16mb buffer.
    checkSort(new Sort(BufferSize.megabytes(16), Sort.defaultTempDir(), Sort.MAX_TEMPFILES),
        generateRandom(Sort.MB * 100));
  }

  /** Generates roughly {@code howMuchData} bytes of random entries, each 0..255 bytes long. */
  private byte[][] generateRandom(int howMuchData) {
    ArrayList<byte[]> data = new ArrayList<byte[]>();
    while (howMuchData > 0) {
      byte [] current = new byte [random.nextInt(256)];
      random.nextBytes(current);
      data.add(current);
      howMuchData -= current.length;
    }
    byte [][] bytes = data.toArray(new byte[data.size()][]);
    return bytes;
  }

  /**
   * Check sorting data on an instance of {@link Sort}: sorts the unsorted
   * file on disk and compares it against an in-memory-sorted golden copy.
   */
  private SortInfo checkSort(Sort sort, byte[][] data) throws IOException {
    File unsorted = writeAll("unsorted", data);

    Arrays.sort(data, Sort.unsignedByteOrderComparator);
    File golden = writeAll("golden", data);

    File sorted = new File(tempDir, "sorted");
    SortInfo sortInfo = sort.sort(unsorted, sorted);
    System.out.println("Input size [MB]: " + unsorted.length() / (1024 * 1024));
    System.out.println(sortInfo);

    assertFilesIdentical(golden, sorted);
    return sortInfo;
  }

  /**
   * Make sure two files are byte-byte identical.
   */
  private void assertFilesIdentical(File golden, File sorted) throws IOException {
    assertEquals(golden.length(), sorted.length());

    // The loop below tolerates partial reads, so a modest buffer suffices
    // (previously two 64MB buffers were allocated needlessly).
    byte [] buf1 = new byte [64 * 1024];
    byte [] buf2 = new byte [64 * 1024];
    int len;
    DataInputStream is1 = new DataInputStream(new FileInputStream(golden));
    DataInputStream is2 = new DataInputStream(new FileInputStream(sorted));
    try {
      while ((len = is1.read(buf1)) > 0) {
        is2.readFully(buf2, 0, len);
        for (int i = 0; i < len; i++) {
          assertEquals(buf1[i], buf2[i]);
        }
      }
    } finally {
      // Previously leaked on a failed assertion: always close both streams so
      // temp files can be removed afterwards (important on Windows).
      IOUtils.close(is1, is2);
    }
  }

  /** Writes all entries to a new file in the temp dir in length-prefixed format. */
  private File writeAll(String name, byte[][] data) throws IOException {
    File file = new File(tempDir, name);
    ByteSequencesWriter w = new Sort.ByteSequencesWriter(file);
    for (byte [] datum : data) {
      w.write(datum);
    }
    w.close();
    return file;
  }
}

View File

@ -199,6 +199,11 @@ New Features
Optimizations
----------------------
* SOLR-2888: FSTSuggester refactoring: internal storage is now UTF-8,
external sorting (on disk) prevents OOMs even with large data sets
(the bottleneck is now FST construction), code cleanups and API cleanups.
(Dawid Weiss)
* SOLR-1875: Per-segment field faceting for single valued string fields.
Enable with facet.method=fcs, control the number of threads used with
the "threads" local param on the facet.field param. This algorithm will

View File

@ -18,16 +18,15 @@ package org.apache.solr.spelling.suggest.fst;
*/
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.fst.FSTLookup;
import org.apache.lucene.search.suggest.fst.*;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.spelling.suggest.LookupFactory;
/**
* Factory for {@link FSTLookup}
* Factory for {@link FSTCompletionLookup}
*/
public class FSTLookupFactory extends LookupFactory {
/**
* The number of separate buckets for weights (discretization). The more buckets,
* the more fine-grained term weights (priorities) can be assigned. The speed of lookup
@ -55,6 +54,6 @@ public class FSTLookupFactory extends LookupFactory {
? Boolean.valueOf(params.get(EXACT_MATCH_FIRST).toString())
: true;
return new FSTLookup(buckets, exactMatchFirst);
return new FSTCompletionLookup(buckets, exactMatchFirst);
}
}