mirror of https://github.com/apache/lucene.git

commit 92de7e8848 (parent a7772b4958)

SOLR-2888: FSTSuggester refactoring: internal storage is now UTF-8;
external sorting (on disk) prevents OOMs even with large data sets (the
bottleneck is now FST construction); code cleanups and API cleanups.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1209265 13f79535-47bb-0310-9956-ffa450edef68
@@ -17,12 +17,22 @@ package org.apache.lucene.util.fst;
 * limitations under the License.
 */

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.CodecUtil;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.fst.Builder.UnCompiledNode;

// TODO: if FST is pure prefix trie we can do a more compact
@@ -335,6 +345,43 @@ public class FST<T> {
    out.writeBytes(bytes, 0, bytes.length);
  }

  /**
   * Writes an automaton to a file.
   */
  public void save(final File file) throws IOException {
    boolean success = false;
    OutputStream os = new BufferedOutputStream(new FileOutputStream(file));
    try {
      save(new OutputStreamDataOutput(os));
      success = true;
    } finally {
      if (success) {
        IOUtils.close(os);
      } else {
        IOUtils.closeWhileHandlingException(os);
      }
    }
  }

  /**
   * Reads an automaton from a file.
   */
  public static <T> FST<T> read(File file, Outputs<T> outputs) throws IOException {
    InputStream is = new BufferedInputStream(new FileInputStream(file));
    boolean success = false;
    try {
      FST<T> fst = new FST<T>(new InputStreamDataInput(is), outputs);
      success = true;
      return fst;
    } finally {
      if (success) {
        IOUtils.close(is);
      } else {
        IOUtils.closeWhileHandlingException(is);
      }
    }
  }

  private void writeLabel(int v) throws IOException {
    assert v >= 0: "v=" + v;
    if (inputType == INPUT_TYPE.BYTE1) {
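The success flag pattern in save/read closes the stream normally on the happy path but suppresses secondary close() exceptions when an earlier exception is already propagating. A round-trip usage sketch (hedged; the file location and the pre-built fst variable are illustrative placeholders, not part of this commit):

// Round-trip sketch: persist an FST and load it back.
File f = new File(System.getProperty("java.io.tmpdir"), "suggest.fst"); // placeholder path
fst.save(f); // 'fst' stands for a previously constructed FST<Object>
FST<Object> restored = FST.read(f, NoOutputs.getSingleton());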
@@ -0,0 +1,29 @@
package org.apache.lucene.search.suggest.fst;

import java.io.IOException;
import java.util.Iterator;

import org.apache.lucene.util.BytesRef;

/**
 * Collects {@link BytesRef} and then allows one to iterate over their sorted order. Implementations
 * of this interface will be called in a single-threaded scenario.
 */
public interface BytesRefSorter {
  /**
   * Adds a single suggestion entry (possibly compound with its bucket).
   *
   * @throws IOException If an I/O exception occurs.
   * @throws IllegalStateException If an addition attempt is performed after
   * a call to {@link #iterator()} has been made.
   */
  void add(BytesRef utf8) throws IOException, IllegalStateException;

  /**
   * Sorts the entries added in {@link #add(BytesRef)} and returns
   * an iterator over all sorted entries.
   *
   * @throws IOException If an I/O exception occurs.
   */
  Iterator<BytesRef> iterator() throws IOException;
}
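For illustration only, a minimal in-memory implementation of this contract might look as follows. This is a sketch, not the InMemorySorter class referenced elsewhere in this change, and it assumes BytesRef's natural ordering is the desired UTF-8 byte order:

// Minimal in-memory sketch of the BytesRefSorter contract (illustration only).
class SimpleInMemorySorter implements BytesRefSorter {
  private final java.util.ArrayList<BytesRef> refs = new java.util.ArrayList<BytesRef>();
  private boolean sealed; // set once iterator() has been called

  @Override
  public void add(BytesRef utf8) {
    if (sealed) throw new IllegalStateException("add() after iterator()");
    refs.add(BytesRef.deepCopyOf(utf8)); // copy: callers may reuse the instance
  }

  @Override
  public Iterator<BytesRef> iterator() {
    sealed = true;
    java.util.Collections.sort(refs); // BytesRef sorts in unsigned byte (UTF-8) order
    return refs.iterator();
  }
}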
@@ -0,0 +1,105 @@
package org.apache.lucene.search.suggest.fst;

import java.io.*;
import java.util.Iterator;
import java.util.NoSuchElementException;

import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesReader;
import org.apache.lucene.util.BytesRef;

/**
 * Builds and iterates over sequences stored on disk.
 */
public class ExternalRefSorter implements BytesRefSorter, Closeable {
  private final Sort sort;
  private Sort.ByteSequencesWriter writer;
  private File input;
  private File sorted;

  /**
   * Will buffer all sequences to a temporary file and then sort (all on-disk).
   */
  public ExternalRefSorter(Sort sort) throws IOException {
    this.sort = sort;
    this.input = File.createTempFile("RefSorter-", ".raw", Sort.defaultTempDir());
    this.writer = new Sort.ByteSequencesWriter(input);
  }

  @Override
  public void add(BytesRef utf8) throws IOException {
    if (writer == null)
      throw new IllegalStateException();
    writer.write(utf8);
  }

  @Override
  public Iterator<BytesRef> iterator() throws IOException {
    if (sorted == null) {
      closeWriter();

      sorted = File.createTempFile("RefSorter-", ".sorted", Sort.defaultTempDir());
      sort.sort(input, sorted);

      input.delete();
      input = null;
    }

    return new ByteSequenceIterator(new Sort.ByteSequencesReader(sorted));
  }

  private void closeWriter() throws IOException {
    if (writer != null) {
      writer.close();
      writer = null;
    }
  }

  /**
   * Removes any written temporary files.
   */
  @Override
  public void close() throws IOException {
    try {
      closeWriter();
    } finally {
      if (input != null) input.delete();
      if (sorted != null) sorted.delete();
    }
  }

  /**
   * Iterates over byte refs in a file.
   */
  class ByteSequenceIterator implements Iterator<BytesRef> {
    private ByteSequencesReader reader;
    private byte[] next;

    public ByteSequenceIterator(ByteSequencesReader reader) throws IOException {
      this.reader = reader;
      this.next = reader.read();
    }

    @Override
    public boolean hasNext() {
      return next != null;
    }

    @Override
    public BytesRef next() {
      if (next == null) throw new NoSuchElementException();
      BytesRef r = new BytesRef(next);
      try {
        next = reader.read();
        if (next == null) {
          reader.close();
        }
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
      return r;
    }

    @Override
    public void remove() { throw new UnsupportedOperationException(); }
  }
}
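A usage sketch: entries are buffered to a temporary file as they are added, sorted on disk on the first iterator() call, and the temporary files are removed by close():

ExternalRefSorter sorter = new ExternalRefSorter(new Sort());
try {
  sorter.add(new BytesRef("banana"));
  sorter.add(new BytesRef("apple"));
  for (Iterator<BytesRef> i = sorter.iterator(); i.hasNext();) {
    System.out.println(i.next().utf8ToString()); // prints "apple", then "banana"
  }
} finally {
  sorter.close(); // removes the temporary files
}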
@@ -0,0 +1,381 @@
package org.apache.lucene.search.suggest.fst;

import java.io.IOException;
import java.util.*;

import org.apache.lucene.util.*;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.Arc;

/**
 * Finite state automata based implementation of "autocomplete" functionality.
 *
 * @see FSTCompletionBuilder
 */

// TODO: we could store exact weights as outputs from the FST (int4 encoded
// floats). This would provide exact outputs from this method and to some
// degree allow post-sorting on a more fine-grained weight.

// TODO: support for Analyzers (infix suggestions, synonyms?)

public class FSTCompletion {
  /**
   * A single completion for a given key.
   */
  public static final class Completion implements Comparable<Completion> {
    public final BytesRef utf8;
    public final int bucket;

    Completion(BytesRef key, int bucket) {
      this.utf8 = BytesRef.deepCopyOf(key);
      this.bucket = bucket;
    }

    @Override
    public String toString() {
      return utf8.utf8ToString() + "/" + bucket;
    }

    /** @see BytesRef#compareTo(BytesRef) */
    public int compareTo(Completion o) {
      return this.utf8.compareTo(o.utf8);
    }
  }

  /**
   * Default number of buckets.
   */
  public static final int DEFAULT_BUCKETS = 10;

  /**
   * An empty result. Keep this an {@link ArrayList} to keep all the returned
   * lists of a single type (monomorphic calls).
   */
  private static final ArrayList<Completion> EMPTY_RESULT = new ArrayList<Completion>();

  /**
   * Finite state automaton encoding all the lookup terms. See class notes for
   * details.
   */
  private final FST<Object> automaton;

  /**
   * An array of arcs leaving the root automaton state and encoding weights of
   * all completions in their sub-trees.
   */
  private final Arc<Object>[] rootArcs;

  /**
   * @see #FSTCompletion(FST, boolean, boolean)
   */
  private boolean exactFirst;

  /**
   * @see #FSTCompletion(FST, boolean, boolean)
   */
  private boolean higherWeightsFirst;

  /**
   * @param automaton
   *          Automaton with completions. See {@link FSTCompletionBuilder}.
   * @param higherWeightsFirst
   *          Return most popular suggestions first. This is the default
   *          behavior for this implementation. Setting it to <code>false</code>
   *          has no effect (use constant term weights to sort alphabetically
   *          only).
   * @param exactFirst
   *          Find and push an exact match to the first position of the result
   *          list if found.
   */
  @SuppressWarnings("unchecked")
  public FSTCompletion(FST<Object> automaton, boolean higherWeightsFirst, boolean exactFirst) {
    this.automaton = automaton;
    if (automaton != null) {
      this.rootArcs = cacheRootArcs(automaton);
    } else {
      this.rootArcs = new Arc[0];
    }
    this.higherWeightsFirst = higherWeightsFirst;
    this.exactFirst = exactFirst;
  }

  /**
   * Defaults to higher weights first and exact first.
   * @see #FSTCompletion(FST, boolean, boolean)
   */
  public FSTCompletion(FST<Object> automaton) {
    this(automaton, true, true);
  }

  /**
   * Cache the root node's output arcs starting with completions with the
   * highest weights.
   */
  @SuppressWarnings({"all"})
  private static Arc<Object>[] cacheRootArcs(FST<Object> automaton) {
    try {
      List<Arc<Object>> rootArcs = new ArrayList<Arc<Object>>();
      Arc<Object> arc = automaton.getFirstArc(new Arc<Object>());
      automaton.readFirstTargetArc(arc, arc);
      while (true) {
        rootArcs.add(new Arc<Object>().copyFrom(arc));
        if (arc.isLast()) break;
        automaton.readNextArc(arc);
      }

      Collections.reverse(rootArcs); // we want highest weights first.
      return rootArcs.toArray(new Arc[rootArcs.size()]);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Returns the first exact match by traversing root arcs, starting from the
   * arc <code>rootArcIndex</code>.
   *
   * @param rootArcIndex
   *          The first root arc index in {@link #rootArcs} to consider when
   *          matching.
   *
   * @param utf8
   *          The sequence of utf8 bytes to follow.
   *
   * @return Returns the bucket number of the match or <code>null</code> if no
   *         match was found.
   */
  private Integer getExactMatchStartingFromRootArc(
      int rootArcIndex, BytesRef utf8) {
    // Get the UTF-8 bytes representation of the input key.
    try {
      final FST.Arc<Object> scratch = new FST.Arc<Object>();
      for (; rootArcIndex < rootArcs.length; rootArcIndex++) {
        final FST.Arc<Object> rootArc = rootArcs[rootArcIndex];
        final FST.Arc<Object> arc = scratch.copyFrom(rootArc);

        // Descend into the automaton using the key as prefix.
        if (descendWithPrefix(arc, utf8)) {
          automaton.readFirstTargetArc(arc, arc);
          if (arc.label == FST.END_LABEL) {
            // Normalize prefix-encoded weight.
            return rootArc.label;
          }
        }
      }
    } catch (IOException e) {
      // Should never happen, but anyway.
      throw new RuntimeException(e);
    }

    // No match.
    return null;
  }

  /**
   * Lookup suggestions to <code>key</code>.
   *
   * @param key
   *          The prefix to which suggestions should be sought.
   * @param num
   *          At most this number of suggestions will be returned.
   * @return Returns the suggestions, sorted by their approximated weight first
   *         (decreasing) and then alphabetically (UTF-8 codepoint order).
   */
  public List<Completion> lookup(String key, int num) {
    if (key.length() == 0 || automaton == null) {
      return EMPTY_RESULT;
    }

    try {
      BytesRef keyUtf8 = new BytesRef(key);
      if (!higherWeightsFirst && rootArcs.length > 1) {
        // We could emit a warning here (?). An optimal strategy for
        // alphabetically sorted suggestions would be to add them with a
        // constant weight -- this saves unnecessary traversals and sorting.
        return lookupSortedAlphabetically(keyUtf8, num);
      } else {
        return lookupSortedByWeight(keyUtf8, num, false);
      }
    } catch (IOException e) {
      // Should never happen, but anyway.
      throw new RuntimeException(e);
    }
  }

  /**
   * Lookup suggestions sorted alphabetically <b>if weights are not
   * constant</b>. This is a workaround: in general, use constant weights for
   * alphabetically sorted result.
   */
  private List<Completion> lookupSortedAlphabetically(BytesRef key, int num)
      throws IOException {
    // Greedily get num results from each weight branch.
    List<Completion> res = lookupSortedByWeight(key, num, true);

    // Sort and trim.
    Collections.sort(res);
    if (res.size() > num) {
      res = res.subList(0, num);
    }
    return res;
  }

  /**
   * Lookup suggestions sorted by weight (descending order).
   *
   * @param collectAll
   *          If <code>false</code>, the routine terminates immediately when
   *          <code>num</code> suggestions have been collected. If
   *          <code>true</code>, it will collect suggestions from all weight
   *          arcs (needed for {@link #lookupSortedAlphabetically}).
   */
  private ArrayList<Completion> lookupSortedByWeight(BytesRef key,
      int num, boolean collectAll) throws IOException {
    // Don't overallocate the results buffers. This also serves the purpose of
    // allowing the user of this class to request all matches using
    // Integer.MAX_VALUE as the number of results.
    final ArrayList<Completion> res = new ArrayList<Completion>(Math.min(10, num));

    final BytesRef output = BytesRef.deepCopyOf(key);
    for (int i = 0; i < rootArcs.length; i++) {
      final FST.Arc<Object> rootArc = rootArcs[i];
      final FST.Arc<Object> arc = new FST.Arc<Object>().copyFrom(rootArc);

      // Descend into the automaton using the key as prefix.
      if (descendWithPrefix(arc, key)) {
        // A subgraph starting from the current node has the completions
        // of the key prefix. The arc we're at is the last key's byte,
        // so we will collect it too.
        output.length = key.length - 1;
        if (collect(res, num, rootArc.label, output, arc) && !collectAll) {
          // We have enough suggestions to return immediately. Keep on looking
          // for an exact match, if requested.
          if (exactFirst) {
            if (!checkExistingAndReorder(res, key)) {
              Integer exactMatchBucket = getExactMatchStartingFromRootArc(i, key);
              if (exactMatchBucket != null) {
                // Insert as the first result and truncate at num.
                while (res.size() >= num) {
                  res.remove(res.size() - 1);
                }
                res.add(0, new Completion(key, exactMatchBucket));
              }
            }
          }
          break;
        }
      }
    }
    return res;
  }

  /**
   * Checks if the list of {@link Completion}s already has a
   * <code>key</code>. If so, reorders that {@link Completion} to the first
   * position.
   *
   * @return Returns <code>true</code> if and only if <code>list</code> contained
   *         <code>key</code>.
   */
  private boolean checkExistingAndReorder(ArrayList<Completion> list, BytesRef key) {
    // We assume list does not have duplicates (because of how the FST is created).
    for (int i = list.size(); --i >= 0;) {
      if (key.equals(list.get(i).utf8)) {
        // Key found. Unless already at i==0, remove it and push up front so
        // that the ordering remains identical with the exception of the exact
        // match.
        list.add(0, list.remove(i));
        return true;
      }
    }
    return false;
  }

  /**
   * Descend along the path starting at <code>arc</code> and going through bytes
   * in the argument.
   *
   * @param arc
   *          The starting arc. This argument is modified in-place.
   * @param utf8
   *          The term to descend along.
   * @return If <code>true</code>, <code>arc</code> will be set to the arc
   *         matching the last byte of <code>utf8</code>. <code>false</code> is
   *         returned if no such prefix exists.
   */
  private boolean descendWithPrefix(Arc<Object> arc, BytesRef utf8)
      throws IOException {
    final int max = utf8.offset + utf8.length;
    for (int i = utf8.offset; i < max; i++) {
      if (automaton.findTargetArc(utf8.bytes[i] & 0xff, arc, arc) == null) {
        // No matching prefixes, return an empty result.
        return false;
      }
    }
    return true;
  }

  /**
   * Recursively collects lookup results from the automaton subgraph starting at
   * <code>arc</code>.
   *
   * @param num
   *          Maximum number of results needed (early termination).
   */
  private boolean collect(List<Completion> res, int num, int bucket,
      BytesRef output, Arc<Object> arc) throws IOException {
    if (output.length == output.bytes.length) {
      output.bytes = ArrayUtil.grow(output.bytes);
    }
    assert output.offset == 0;
    output.bytes[output.length++] = (byte) arc.label;

    automaton.readFirstTargetArc(arc, arc);
    while (true) {
      if (arc.label == FST.END_LABEL) {
        res.add(new Completion(output, bucket));
        if (res.size() >= num) return true;
      } else {
        int save = output.length;
        if (collect(res, num, bucket, output, new Arc<Object>().copyFrom(arc))) {
          return true;
        }
        output.length = save;
      }

      if (arc.isLast()) {
        break;
      }
      automaton.readNextArc(arc);
    }
    return false;
  }

  /**
   * Returns the bucket count (discretization thresholds).
   */
  public int getBucketCount() {
    return rootArcs.length;
  }

  /**
   * Returns the bucket assigned to a given key (if found) or <code>null</code> if
   * no exact match exists.
   */
  public Integer getBucket(String key) {
    return getExactMatchStartingFromRootArc(0, new BytesRef(key));
  }

  /**
   * Returns the internal automaton.
   */
  public FST<Object> getFST() {
    return automaton;
  }
}
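A usage sketch (hedged): given an automaton produced by FSTCompletionBuilder (the variable fst below is a placeholder), lookups return Completion objects ordered by bucket, with an exact match promoted to the head of the list when exactFirst is set:

// 'fst' is a placeholder for an FST<Object> built by FSTCompletionBuilder.
FSTCompletion completion = new FSTCompletion(fst, true, true);
for (FSTCompletion.Completion c : completion.lookup("ab", 5)) {
  System.out.println(c.utf8.utf8ToString() + " (bucket " + c.bucket + ")");
}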
@@ -0,0 +1,233 @@
package org.apache.lucene.search.suggest.fst;

import java.io.Closeable;
import java.io.IOException;
import java.util.Iterator;

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.fst.*;

/**
 * Finite state automata based implementation of "autocomplete" functionality.
 *
 * <h2>Implementation details</h2>
 *
 * <p>
 * The construction step in {@link #build()} works as follows:
 * <ul>
 * <li>A set of input terms and their buckets is given.</li>
 * <li>All terms in the input are prefixed with a synthetic pseudo-character
 * (code) of the weight bucket the term fell into. For example a term
 * <code>abc</code> with a discretized weight equal to '1' would become
 * <code>1abc</code>.</li>
 * <li>The terms are then sorted by their raw UTF-8 byte values
 * (including the synthetic bucket code in front).</li>
 * <li>A finite state automaton ({@link FST}) is constructed from the input. The
 * root node has arcs labeled with all possible weights. We cache all these
 * arcs, highest-weight first.</li>
 * </ul>
 *
 * <p>
 * At runtime, in {@link FSTCompletion#lookup(String, int)},
 * the automaton is utilized as follows:
 * <ul>
 * <li>For each possible term weight encoded in the automaton (cached arcs from
 * the root above), starting with the highest one, we descend along the path of
 * the input key. If the key is not a prefix of a sequence in the automaton
 * (path ends prematurely), we exit immediately -- no completions.</li>
 * <li>Otherwise, we have found an internal automaton node that ends the key.
 * <b>The entire subautomaton (all paths) starting from this node forms the key's
 * completions.</b> We start the traversal of this subautomaton. Every time we
 * reach a final state (arc), we add a single suggestion to the list of results
 * (the weight of this suggestion is constant and equal to the root path we
 * started from). The tricky part is that because automaton edges are sorted and
 * we scan depth-first, we can terminate the entire procedure as soon as we
 * collect as many suggestions as the user requested.</li>
 * <li>In case the number of suggestions collected in the step above is still
 * insufficient, we proceed to the next (smaller) weight leaving the root node
 * and repeat the same algorithm again.</li>
 * </ul>
 *
 * <h2>Runtime behavior and performance characteristics</h2>
 *
 * The algorithm described above is optimized for finding suggestions to short
 * prefixes in a top-weights-first order. This is probably the most common use
 * case: it allows presenting suggestions early and sorts them by the global
 * frequency (and then alphabetically).
 *
 * <p>
 * If there is an exact match in the automaton, it is returned first on the
 * results list (even with by-weight sorting).
 *
 * <p>
 * Note that the maximum lookup time for <b>any prefix</b> is the time of
 * descending to the subtree, plus traversal of the subtree up to the number of
 * requested suggestions (because they are already presorted by weight on the
 * root level and alphabetically at any node level).
 *
 * <p>
 * To order alphabetically only (no ordering by priorities), use identical term
 * weights for all terms. Alphabetical suggestions are returned even if
 * non-constant weights are used, but the algorithm for doing this is
 * suboptimal.
 *
 * <p>
 * "alphabetically" in any of the documentation above indicates UTF-8
 * representation order, nothing else.
 *
 * <p>
 * <b>NOTE</b>: the FST file format is experimental and subject to sudden
 * change, requiring you to rebuild the FST suggest index.
 *
 * @see FSTCompletion
 */
public class FSTCompletionBuilder {
  /**
   * Default number of buckets.
   */
  public static final int DEFAULT_BUCKETS = 10;

  /**
   * The number of separate buckets for weights (discretization). The more
   * buckets, the more fine-grained term weights (priorities) can be assigned.
   * The speed of lookup will not decrease for prefixes which have
   * highly-weighted completions (because these are filled-in first), but will
   * decrease significantly for low-weighted terms (but these should be
   * infrequent, so it is all right).
   *
   * <p>
   * The number of buckets must be within [1, 255] range.
   */
  private final int buckets;

  /**
   * Finite state automaton encoding all the lookup terms. See class notes for
   * details.
   */
  FST<Object> automaton;

  /**
   * FST construction requires re-sorting the input. This is the class that
   * collects all the input entries, their weights and then provides sorted
   * order.
   */
  private final BytesRefSorter sorter;

  /**
   * Scratch buffer for {@link #add(BytesRef, int)}.
   */
  private final BytesRef scratch = new BytesRef();

  /**
   * Max tail sharing length.
   */
  private final int shareMaxTailLength;

  /**
   * Creates an {@link FSTCompletionBuilder} with default options: 10 buckets, exact match
   * promoted to first position and {@link InMemorySorter}.
   */
  public FSTCompletionBuilder() {
    this(DEFAULT_BUCKETS, new InMemorySorter(), Integer.MAX_VALUE);
  }

  /**
   * @param buckets
   *          The number of buckets for weight discretization. Bucket values are
   *          used in {@link #add(BytesRef, int)} and must be smaller than the
   *          number given here.
   *
   * @param sorter
   *          {@link BytesRefSorter} used for re-sorting input for the automaton.
   *          For large inputs, use on-disk sorting implementations. The sorter
   *          is closed automatically in {@link #build()} if it implements
   *          {@link Closeable}.
   *
   * @param shareMaxTailLength
   *          Max shared suffix length.
   *
   *          See the description of this parameter in {@link Builder}'s constructor.
   *          In general, for very large inputs you'll want to construct a non-minimal
   *          automaton which will be larger, but the construction will take far less RAM.
   *          For minimal automata, set it to {@link Integer#MAX_VALUE}.
   */
  public FSTCompletionBuilder(int buckets, BytesRefSorter sorter, int shareMaxTailLength) {
    if (buckets < 1 || buckets > 255) {
      throw new IllegalArgumentException("Buckets must be >= 1 and <= 255: "
          + buckets);
    }

    if (sorter == null) throw new IllegalArgumentException(
        "BytesRefSorter must not be null.");

    this.sorter = sorter;
    this.buckets = buckets;
    this.shareMaxTailLength = shareMaxTailLength;
  }

  /**
   * Appends a single suggestion and its weight to the internal buffers.
   *
   * @param utf8
   *          The suggestion (utf8 representation) to be added. The content is
   *          copied and the object can be reused.
   * @param bucket
   *          The bucket to place this suggestion in. Must be non-negative and
   *          smaller than the number of buckets passed in the constructor.
   *          Higher numbers indicate suggestions that should be presented
   *          before suggestions placed in smaller buckets.
   */
  public void add(BytesRef utf8, int bucket) throws IOException {
    if (bucket < 0 || bucket >= buckets) {
      throw new IllegalArgumentException(
          "Bucket outside of the allowed range [0, " + buckets + "): " + bucket);
    }

    if (scratch.bytes.length < utf8.length + 1) {
      scratch.grow(utf8.length + 10);
    }

    scratch.length = 1;
    scratch.bytes[0] = (byte) bucket;
    scratch.append(utf8);
    sorter.add(scratch);
  }

  /**
   * Builds the final automaton from the list of added entries. This method can
   * take a long time since it must construct the automaton.
   */
  public FSTCompletion build() throws IOException {
    this.automaton = buildAutomaton(sorter);

    if (sorter instanceof Closeable) {
      ((Closeable) sorter).close();
    }

    return new FSTCompletion(automaton);
  }

  /**
   * Builds the final automaton from a list of entries.
   */
  private FST<Object> buildAutomaton(BytesRefSorter sorter) throws IOException {
    // Build the automaton.
    final Outputs<Object> outputs = NoOutputs.getSingleton();
    final Object empty = outputs.getNoOutput();
    final Builder<Object> builder = new Builder<Object>(
        FST.INPUT_TYPE.BYTE1, 0, 0, true, true,
        shareMaxTailLength, outputs, null);

    BytesRef scratch = new BytesRef();
    int count = 0;
    for (Iterator<BytesRef> i = sorter.iterator(); i.hasNext(); count++) {
      BytesRef entry = i.next();
      if (scratch.compareTo(entry) != 0) {
        builder.add(entry, empty);
        scratch.copyBytes(entry);
      }
    }

    return count == 0 ? null : builder.finish();
  }
}
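A usage sketch for the builder (the bucket values shown are arbitrary illustrations, and the snippet belongs inside a method that declares IOException): add bucket-prefixed suggestions, then build() returns a ready FSTCompletion:

FSTCompletionBuilder builder = new FSTCompletionBuilder();
builder.add(new BytesRef("apple"), 9);   // frequent term, high bucket
builder.add(new BytesRef("apricot"), 3); // rarer term, lower bucket
FSTCompletion completion = builder.build();
System.out.println(completion.lookup("ap", 2)); // [apple/9, apricot/3]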
@@ -0,0 +1,251 @@
package org.apache.lucene.search.suggest.fst;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.fst.FSTCompletion.Completion;
import org.apache.lucene.search.suggest.fst.Sort.SortInfo;
import org.apache.lucene.search.suggest.tst.TSTLookup;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.*;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.NoOutputs;

/**
 * An adapter from {@link Lookup} API to {@link FSTCompletion}.
 *
 * <p>This adapter differs from {@link FSTCompletion} in that it attempts
 * to discretize any "weights" as returned by {@link TermFreqIterator#freq()}
 * to match the number of buckets. For the rationale for bucketing, see
 * {@link FSTCompletion}.
 *
 * <p><b>Note:</b> Discretization requires an additional sorting pass.
 *
 * <p>The range of weights for bucketing/discretization is determined
 * by sorting the input by weight and then dividing into
 * equal ranges. Then, scores within each range are assigned to that bucket.
 *
 * <p>Note that this means that even large differences in weights may be lost
 * during automaton construction, but the overall distinction between "classes"
 * of weights will be preserved regardless of the distribution of weights.
 *
 * <p>For fine-grained control over which weights are assigned to which buckets,
 * use {@link FSTCompletion} directly or {@link TSTLookup}, for example.
 *
 * @see FSTCompletion
 */
public class FSTCompletionLookup extends Lookup {
  /**
   * Shared tail length for conflating in the created automaton. Setting this
   * to larger values ({@link Integer#MAX_VALUE}) will create smaller (or minimal)
   * automata at the cost of RAM for keeping the nodes hash in the {@link FST}.
   *
   * <p>Empirical pick.
   */
  private final static int sharedTailLength = 5;

  /**
   * File name for the automaton.
   *
   * @see #store(File)
   * @see #load(File)
   */
  private static final String FILENAME = "fst.bin";

  private int buckets;
  private boolean exactMatchFirst;

  /**
   * Automaton used for completions with higher weights reordering.
   */
  private FSTCompletion higherWeightsCompletion;

  /**
   * Automaton used for normal completions.
   */
  private FSTCompletion normalCompletion;

  public FSTCompletionLookup() {
    this(FSTCompletion.DEFAULT_BUCKETS, true);
  }

  public FSTCompletionLookup(FSTCompletion completion, int buckets, boolean exactMatchFirst) {
    this(buckets, exactMatchFirst);
    this.normalCompletion = new FSTCompletion(
        completion.getFST(), false, exactMatchFirst);
    this.higherWeightsCompletion = new FSTCompletion(
        completion.getFST(), true, exactMatchFirst);
  }

  public FSTCompletionLookup(int buckets, boolean exactMatchFirst) {
    this.buckets = buckets;
    this.exactMatchFirst = exactMatchFirst;
  }

  @Override
  public void build(TermFreqIterator tfit) throws IOException {
    File tempInput = File.createTempFile(
        FSTCompletionLookup.class.getSimpleName(), ".input", Sort.defaultTempDir());
    File tempSorted = File.createTempFile(
        FSTCompletionLookup.class.getSimpleName(), ".sorted", Sort.defaultTempDir());

    Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput);
    Sort.ByteSequencesReader reader = null;

    // Push floats up front before sequences to sort them. For now, assume they are non-negative.
    // If negative floats are allowed some trickery needs to be done to find their byte order.
    boolean success = false;
    try {
      BytesRef tmp1 = new BytesRef();
      byte [] buffer = new byte [0];
      ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
      while (tfit.hasNext()) {
        String key = tfit.next();
        UnicodeUtil.UTF16toUTF8(key, 0, key.length(), tmp1);

        if (tmp1.length + 4 >= buffer.length) {
          buffer = ArrayUtil.grow(buffer, tmp1.length + 4);
        }

        output.reset(buffer);
        output.writeInt(FloatMagic.toSortable(tfit.freq()));
        output.writeBytes(tmp1.bytes, tmp1.offset, tmp1.length);
        writer.write(buffer, 0, output.getPosition());
      }
      writer.close();

      // We don't know the distribution of scores and we need to bucket them, so we'll sort
      // and divide into equal buckets.
      SortInfo info = new Sort().sort(tempInput, tempSorted);
      tempInput.delete();
      FSTCompletionBuilder builder = new FSTCompletionBuilder(
          buckets, new ExternalRefSorter(new Sort()), sharedTailLength);

      final int inputLines = info.lines;
      reader = new Sort.ByteSequencesReader(tempSorted);
      long line = 0;
      int previousBucket = 0;
      float previousScore = 0;
      ByteArrayDataInput input = new ByteArrayDataInput();
      BytesRef tmp2 = new BytesRef();
      while (reader.read(tmp1)) {
        input.reset(tmp1.bytes);
        float currentScore = FloatMagic.fromSortable(input.readInt());

        int bucket;
        if (line > 0 && currentScore == previousScore) {
          bucket = previousBucket;
        } else {
          bucket = (int) (line * buckets / inputLines);
        }
        previousScore = currentScore;
        previousBucket = bucket;

        // Only append the input, discard the weight.
        tmp2.bytes = tmp1.bytes;
        tmp2.offset = input.getPosition();
        tmp2.length = tmp1.length - input.getPosition();
        builder.add(tmp2, bucket);

        line++;
      }

      // The two FSTCompletions share the same automaton.
      this.higherWeightsCompletion = builder.build();
      this.normalCompletion = new FSTCompletion(
          higherWeightsCompletion.getFST(), false, exactMatchFirst);

      success = true;
    } finally {
      if (success)
        IOUtils.close(reader, writer);
      else
        IOUtils.closeWhileHandlingException(reader, writer);

      tempInput.delete();
      tempSorted.delete();
    }
  }

  @Override
  public List<LookupResult> lookup(String key, boolean higherWeightsFirst, int num) {
    final List<Completion> completions;
    if (higherWeightsFirst) {
      completions = higherWeightsCompletion.lookup(key, num);
    } else {
      completions = normalCompletion.lookup(key, num);
    }

    final ArrayList<LookupResult> results = new ArrayList<LookupResult>(completions.size());
    for (Completion c : completions) {
      results.add(new LookupResult(c.utf8.utf8ToString(), c.bucket));
    }
    return results;
  }

  @Override
  public boolean add(String key, Object value) {
    // Not supported.
    return false;
  }

  @Override
  public Float get(String key) {
    Integer bucket = normalCompletion.getBucket(key);
    if (bucket == null)
      return null;
    else
      return (float) normalCompletion.getBucket(key) / normalCompletion.getBucketCount();
  }

  /**
   * Deserialization from disk.
   */
  @Override
  public synchronized boolean load(File storeDir) throws IOException {
    File data = new File(storeDir, FILENAME);
    if (!data.exists() || !data.canRead()) {
      return false;
    }

    this.higherWeightsCompletion = new FSTCompletion(
        FST.read(data, NoOutputs.getSingleton()));
    this.normalCompletion = new FSTCompletion(
        higherWeightsCompletion.getFST(), false, exactMatchFirst);

    return true;
  }

  /**
   * Serialization to disk.
   */
  @Override
  public synchronized boolean store(File storeDir) throws IOException {
    if (!storeDir.exists() || !storeDir.isDirectory() || !storeDir.canWrite()) {
      return false;
    }

    if (this.normalCompletion == null)
      return false;

    normalCompletion.getFST().save(new File(storeDir, FILENAME));
    return true;
  }
}
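build() above relies on a FloatMagic helper (added elsewhere in this commit, not shown in this excerpt) to serialize weights so that a raw byte-wise sort orders them numerically. For non-negative floats, as assumed in the comment above, the standard trick looks roughly like this (a sketch, not necessarily the committed implementation):

// Sortable-float sketch (assumptions: non-negative inputs, big-endian writeInt).
// The IEEE 754 bit pattern of a non-negative float already increases with its
// value; flipping the sign bit makes the int compare correctly as unsigned too,
// which matches a lexicographic byte-wise sort of the serialized form.
static int toSortable(float f) {
  return Float.floatToIntBits(f) ^ 0x80000000;
}
static float fromSortable(int v) {
  return Float.intBitsToFloat(v ^ 0x80000000);
}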
@ -1,570 +0,0 @@
|
|||
package org.apache.lucene.search.suggest.fst;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.fst.Builder;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
import org.apache.lucene.util.fst.FST.Arc;
|
||||
import org.apache.lucene.util.fst.NoOutputs;
|
||||
import org.apache.lucene.util.fst.Outputs;
|
||||
|
||||
import org.apache.lucene.search.suggest.Lookup;
|
||||
import org.apache.lucene.search.suggest.tst.TSTLookup;
|
||||
import org.apache.lucene.search.spell.TermFreqIterator;
|
||||
import org.apache.lucene.store.InputStreamDataInput;
|
||||
import org.apache.lucene.store.OutputStreamDataOutput;
|
||||
|
||||
/**
|
||||
* Finite state automata based implementation of {@link Lookup} query
|
||||
* suggestion/ autocomplete interface.
|
||||
*
|
||||
* <h2>Implementation details</h2>
|
||||
*
|
||||
* <p>The construction step in {@link #build(TermFreqIterator)} works as follows:
|
||||
* <ul>
|
||||
* <li>A set of input terms (String) and weights (float) is given.</li>
|
||||
* <li>The range of weights is determined and then all weights are discretized into a fixed set
|
||||
* of values ({@link #buckets}).
|
||||
* Note that this means that minor changes in weights may be lost during automaton construction.
|
||||
* In general, this is not a big problem because the "priorities" of completions can be split
|
||||
* into a fixed set of classes (even as rough as: very frequent, frequent, baseline, marginal).
|
||||
* If you need exact, fine-grained weights, use {@link TSTLookup} instead.<li>
|
||||
* <li>All terms in the input are preprended with a synthetic pseudo-character being the weight
|
||||
* of that term. For example a term <code>abc</code> with a discretized weight equal '1' would
|
||||
* become <code>1abc</code>.</li>
|
||||
* <li>The terms are sorted by their raw value of utf16 character values (including the synthetic
|
||||
* term in front).</li>
|
||||
* <li>A finite state automaton ({@link FST}) is constructed from the input. The root node has
|
||||
* arcs labeled with all possible weights. We cache all these arcs, highest-weight first.</li>
|
||||
* </ul>
|
||||
*
|
||||
* <p>At runtime, in {@link #lookup(String, boolean, int)}, the automaton is utilized as follows:
|
||||
* <ul>
|
||||
* <li>For each possible term weight encoded in the automaton (cached arcs from the root above),
|
||||
* starting with the highest one, we descend along the path of the input key. If the key is not
|
||||
* a prefix of a sequence in the automaton (path ends prematurely), we exit immediately.
|
||||
* No completions.
|
||||
* <li>Otherwise, we have found an internal automaton node that ends the key. <b>The entire
|
||||
* subautomaton (all paths) starting from this node form the key's completions.</b> We start
|
||||
* the traversal of this subautomaton. Every time we reach a final state (arc), we add a single
|
||||
* suggestion to the list of results (the weight of this suggestion is constant and equal to the
|
||||
* root path we started from). The tricky part is that because automaton edges are sorted and
|
||||
* we scan depth-first, we can terminate the entire procedure as soon as we collect enough
|
||||
* suggestions the user requested.
|
||||
* <li>In case the number of suggestions collected in the step above is still insufficient,
|
||||
* we proceed to the next (smaller) weight leaving the root node and repeat the same
|
||||
* algorithm again.
|
||||
* </li>
|
||||
* </ul>
|
||||
*
|
||||
* <h2>Runtime behavior and performance characteristic</h2>
|
||||
*
|
||||
* <p>The algorithm described above is optimized for finding suggestions to short prefixes
|
||||
* in a top-weights-first order. This is probably the most common use case: it allows
|
||||
* presenting suggestions early and sorts them by the global frequency (and then alphabetically).
|
||||
*
|
||||
* <p>If there is an exact match in the automaton, it is returned first on the results
|
||||
* list (even with by-weight sorting).
|
||||
*
|
||||
* <p>Note that the maximum lookup time for <b>any prefix</b>
|
||||
* is the time of descending to the subtree, plus traversal of the subtree up to the number
|
||||
* of requested suggestions (because they are already presorted by weight on the root level
|
||||
* and alphabetically at any node level).
|
||||
*
|
||||
* <p>To order alphabetically only (no ordering by priorities), use identical term weights
|
||||
* for all terms. Alphabetical suggestions are returned even if non-constant weights are
|
||||
* used, but the algorithm for doing this is suboptimal.
|
||||
*
|
||||
* <p>"alphabetically" in any of the documentation above indicates utf16 codepoint order,
|
||||
* nothing else.
|
||||
*
|
||||
* <b>NOTE</b>: the FST file format is experimental and
|
||||
* subject to suddenly change, requiring you to rebuild the
|
||||
* FST suggest index.
|
||||
*/
|
||||
public class FSTLookup extends Lookup {
|
||||
|
||||
public FSTLookup() {
|
||||
this(10, true);
|
||||
}
|
||||
|
||||
public FSTLookup(int buckets, boolean exactMatchFirst) {
|
||||
this.buckets = buckets;
|
||||
this.exactMatchFirst = exactMatchFirst;
|
||||
}
|
||||
|
||||
/** A structure for a single entry (for sorting/ preprocessing). */
|
||||
private static class Entry {
|
||||
char [] term;
|
||||
float weight;
|
||||
|
||||
public Entry(char [] term, float freq) {
|
||||
this.term = term;
|
||||
this.weight = freq;
|
||||
}
|
||||
}
|
||||
|
||||
/** Serialized automaton file name (storage). */
|
||||
public static final String FILENAME = "fst.dat";
|
||||
|
||||
/** An empty result. */
|
||||
private static final List<LookupResult> EMPTY_RESULT = Collections.emptyList();
|
||||
|
||||
/**
|
||||
* The number of separate buckets for weights (discretization). The more buckets,
|
||||
* the more fine-grained term weights (priorities) can be assigned. The speed of lookup
|
||||
* will not decrease for prefixes which have highly-weighted completions (because these
|
||||
* are filled-in first), but will decrease significantly for low-weighted terms (but
|
||||
* these should be infrequent, so it is all right).
|
||||
*
|
||||
* <p>The number of buckets must be within [1, 255] range.
|
||||
*/
|
||||
private final int buckets;
|
||||
|
||||
/**
|
||||
* If <code>true</code>, exact suggestions are returned first, even if they are prefixes
|
||||
* of other strings in the automaton (possibly with larger weights).
|
||||
*/
|
||||
private final boolean exactMatchFirst;
|
||||
|
||||
/**
|
||||
* Finite state automaton encoding all the lookup terms. See class
|
||||
* notes for details.
|
||||
*/
|
||||
private FST<Object> automaton;
|
||||
|
||||
/**
|
||||
* An array of arcs leaving the root automaton state and encoding weights of all
|
||||
* completions in their sub-trees.
|
||||
*/
|
||||
private Arc<Object> [] rootArcs;
|
||||
|
||||
/* */
|
||||
@Override
|
||||
public void build(TermFreqIterator tfit) throws IOException {
|
||||
// Buffer the input because we will need it twice: for calculating
|
||||
// weights distribution and for the actual automata building.
|
||||
List<Entry> entries = new ArrayList<Entry>();
|
||||
while (tfit.hasNext()) {
|
||||
String term = tfit.next();
|
||||
char [] termChars = new char [term.length() + 1]; // add padding for weight.
|
||||
for (int i = 0; i < term.length(); i++)
|
||||
termChars[i + 1] = term.charAt(i);
|
||||
entries.add(new Entry(termChars, tfit.freq()));
|
||||
}
|
||||
|
||||
// Distribute weights into at most N buckets. This is a form of discretization to
|
||||
// limit the number of possible weights so that they can be efficiently encoded in the
|
||||
// automaton.
|
||||
//
|
||||
// It is assumed the distribution of weights is _linear_ so proportional division
|
||||
// of [min, max] range will be enough here. Other approaches could be to sort
|
||||
// weights and divide into proportional ranges.
|
||||
if (entries.size() > 0) {
|
||||
redistributeWeightsProportionalMinMax(entries, buckets);
|
||||
encodeWeightPrefix(entries);
|
||||
}
|
||||
|
||||
// Build the automaton (includes input sorting) and cache root arcs in order from the highest,
|
||||
// to the lowest weight.
|
||||
this.automaton = buildAutomaton(entries);
|
||||
cacheRootArcs();
|
||||
}
|
||||
|
||||
/**
|
||||
* Cache the root node's output arcs starting with completions with the highest weights.
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
private void cacheRootArcs() throws IOException {
|
||||
if (automaton != null) {
|
||||
List<Arc<Object>> rootArcs = new ArrayList<Arc<Object>>();
|
||||
Arc<Object> arc = automaton.getFirstArc(new Arc<Object>());
|
||||
automaton.readFirstTargetArc(arc, arc);
|
||||
while (true) {
|
||||
rootArcs.add(new Arc<Object>().copyFrom(arc));
|
||||
if (arc.isLast())
|
||||
break;
|
||||
automaton.readNextArc(arc);
|
||||
}
|
||||
|
||||
Collections.reverse(rootArcs); // we want highest weights first.
|
||||
this.rootArcs = rootArcs.toArray(new Arc[rootArcs.size()]);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Not implemented.
|
||||
*/
|
||||
@Override
|
||||
public boolean add(String key, Object value) {
|
||||
// This implementation does not support ad-hoc additions (all input
|
||||
// must be sorted for the builder).
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the (approximated) weight of a single key (if there is a perfect match
|
||||
* for it in the automaton).
|
||||
*
|
||||
* @return Returns the approximated weight of the input key or <code>null</code>
|
||||
* if not found.
|
||||
*/
|
||||
@Override
|
||||
public Float get(String key) {
|
||||
return getExactMatchStartingFromRootArc(0, key);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the first exact match by traversing root arcs, starting from
|
||||
* the arc <code>i</code>.
|
||||
*
|
||||
* @param i The first root arc index in {@link #rootArcs} to consider when
|
||||
* matching.
|
||||
*/
|
||||
private Float getExactMatchStartingFromRootArc(int i, String key) {
|
||||
// Get the UTF-8 bytes representation of the input key.
|
||||
try {
|
||||
final FST.Arc<Object> scratch = new FST.Arc<Object>();
|
||||
for (; i < rootArcs.length; i++) {
|
||||
final FST.Arc<Object> rootArc = rootArcs[i];
|
||||
final FST.Arc<Object> arc = scratch.copyFrom(rootArc);
|
||||
|
||||
// Descend into the automaton using the key as prefix.
|
||||
if (descendWithPrefix(arc, key)) {
|
||||
automaton.readFirstTargetArc(arc, arc);
|
||||
if (arc.label == FST.END_LABEL) {
|
||||
// Prefix-encoded weight.
|
||||
return rootArc.label / (float) buckets;
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
// Should never happen, but anyway.
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Lookup autocomplete suggestions to <code>key</code>.
|
||||
*
|
||||
* @param key The prefix to which suggestions should be sought.
|
||||
* @param onlyMorePopular Return most popular suggestions first. This is the default
|
||||
* behavior for this implementation. Setting it to <code>false</code> has no effect (use
|
||||
* constant term weights to sort alphabetically only).
|
||||
* @param num At most this number of suggestions will be returned.
|
||||
* @return Returns the suggestions, sorted by their approximated weight first (decreasing)
|
||||
* and then alphabetically (utf16 codepoint order).
|
||||
*/
|
||||
@Override
|
||||
public List<LookupResult> lookup(String key, boolean onlyMorePopular, int num) {
|
||||
if (key.length() == 0 || automaton == null) {
|
||||
// Keep the result an ArrayList to keep calls monomorphic.
|
||||
return EMPTY_RESULT;
|
||||
}
|
||||
|
||||
try {
|
||||
if (!onlyMorePopular && rootArcs.length > 1) {
|
||||
// We could emit a warning here (?). An optimal strategy for alphabetically sorted
|
||||
// suggestions would be to add them with a constant weight -- this saves unnecessary
|
||||
// traversals and sorting.
|
||||
return lookupSortedAlphabetically(key, num);
|
||||
} else {
|
||||
return lookupSortedByWeight(key, num, false);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
// Should never happen, but anyway.
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Lookup suggestions sorted alphabetically <b>if weights are not constant</b>. This
|
||||
* is a workaround: in general, use constant weights for alphabetically sorted result.
|
||||
*/
|
||||
private List<LookupResult> lookupSortedAlphabetically(String key, int num) throws IOException {
|
||||
// Greedily get num results from each weight branch.
|
||||
List<LookupResult> res = lookupSortedByWeight(key, num, true);
|
||||
|
||||
// Sort and trim.
|
||||
Collections.sort(res, new Comparator<LookupResult>() {
|
||||
// not till java6 @Override
|
||||
public int compare(LookupResult o1, LookupResult o2) {
|
||||
return o1.key.compareTo(o2.key);
|
||||
}
|
||||
});
|
||||
if (res.size() > num) {
|
||||
res = res.subList(0, num);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
/**
|
||||
* Lookup suggestions sorted by weight (descending order).
|
||||
*
|
||||
* @param collectAll If <code>true</code>, the routine terminates immediately when <code>num</code>
|
||||
* suggestions have been collected. If <code>false</code>, it will collect suggestions from
|
||||
* all weight arcs (needed for {@link #lookupSortedAlphabetically}.
|
||||
*/
|
||||
private ArrayList<LookupResult> lookupSortedByWeight(String key, int num, boolean collectAll) throws IOException {
|
||||
// Don't overallocate the results buffers. This also serves the purpose of allowing
|
||||
// the user of this class to request all matches using Integer.MAX_VALUE as the number
|
||||
// of results.
|
||||
final ArrayList<LookupResult> res = new ArrayList<LookupResult>(Math.min(10, num));
|
||||
final StringBuilder output = new StringBuilder(key);
|
||||
final int matchLength = key.length() - 1;
|
||||
|
||||
for (int i = 0; i < rootArcs.length; i++) {
|
||||
final FST.Arc<Object> rootArc = rootArcs[i];
|
||||
final FST.Arc<Object> arc = new FST.Arc<Object>().copyFrom(rootArc);
|
||||
|
||||
// Descend into the automaton using the key as prefix.
|
||||
if (descendWithPrefix(arc, key)) {
|
||||
// Prefix-encoded weight.
|
||||
final float weight = rootArc.label / (float) buckets;
|
||||
|
||||
// A subgraph starting from the current node has the completions
|
||||
// of the key prefix. The arc we're at is the last key's byte,
|
||||
// so we will collect it too.
|
||||
output.setLength(matchLength);
|
||||
if (collect(res, num, weight, output, arc) && !collectAll) {
|
||||
// We have enough suggestions to return immediately. Keep on looking for an
|
||||
// exact match, if requested.
|
||||
if (exactMatchFirst) {
|
||||
if (!checkExistingAndReorder(res, key)) {
|
||||
Float exactMatchWeight = getExactMatchStartingFromRootArc(i, key);
|
||||
if (exactMatchWeight != null) {
|
||||
// Insert as the first result and truncate at num.
|
||||
while (res.size() >= num) {
|
||||
res.remove(res.size() - 1);
|
||||
}
|
||||
res.add(0, new LookupResult(key, exactMatchWeight));
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if the list of {@link LookupResult}s already has a <code>key</code>. If so,
|
||||
* reorders that {@link LookupResult} to the first position.
|
||||
*
|
||||
* @return Returns <code>true<code> if and only if <code>list</code> contained <code>key</code>.
|
||||
*/
|
||||
private boolean checkExistingAndReorder(ArrayList<LookupResult> list, String key) {
|
||||
// We assume list does not have duplicates (because of how the FST is created).
|
||||
for (int i = list.size(); --i >= 0;) {
|
||||
if (key.equals(list.get(i).key)) {
|
||||
// Key found. Unless already at i==0, remove it and push up front so that the ordering
|
||||
// remains identical with the exception of the exact match.
|
||||
list.add(0, list.remove(i));
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}

  /**
   * Descends along the path starting at <code>arc</code> and going through
   * the characters of the <code>term</code> argument.
   *
   * @param arc The starting arc. This argument is modified in-place.
   * @param term The term to descend with.
   * @return If <code>true</code>, <code>arc</code> will be set to the arc matching
   * the last character of <code>term</code>. <code>false</code> is returned if no such
   * prefix exists.
   */
  private boolean descendWithPrefix(Arc<Object> arc, String term) throws IOException {
    final int max = term.length();

    for (int i = 0; i < max; i++) {
      if (automaton.findTargetArc(term.charAt(i) & 0xffff, arc, arc) == null) {
        // No matching prefixes, return an empty result.
        return false;
      }
    }

    return true;
  }

  /**
   * Recursively collects lookup results from the automaton subgraph starting at <code>arc</code>.
   *
   * @param num Maximum number of results needed (early termination).
   * @param weight Weight of all results found during this collection.
   */
  private boolean collect(List<LookupResult> res, int num, float weight, StringBuilder output, Arc<Object> arc) throws IOException {
    output.append((char) arc.label);

    automaton.readFirstTargetArc(arc, arc);
    while (true) {
      if (arc.label == FST.END_LABEL) {
        res.add(new LookupResult(output.toString(), weight));
        if (res.size() >= num)
          return true;
      } else {
        int save = output.length();
        if (collect(res, num, weight, output, new Arc<Object>().copyFrom(arc))) {
          return true;
        }
        output.setLength(save);
      }

      if (arc.isLast()) {
        break;
      }
      automaton.readNextArc(arc);
    }
    return false;
  }

  /**
   * Builds the final automaton from a list of entries.
   */
  private FST<Object> buildAutomaton(List<Entry> entries) throws IOException {
    if (entries.size() == 0)
      return null;

    // Sort by UTF-16 (raw char value).
    final Comparator<Entry> comp = new Comparator<Entry>() {
      public int compare(Entry o1, Entry o2) {
        char [] ch1 = o1.term;
        char [] ch2 = o2.term;
        int len1 = ch1.length;
        int len2 = ch2.length;

        int max = Math.min(len1, len2);
        for (int i = 0; i < max; i++) {
          int v = ch1[i] - ch2[i];
          if (v != 0) return v;
        }
        return len1 - len2;
      }
    };
    Collections.sort(entries, comp);

    // Remove duplicate entries: duplicates break automaton construction.
    int len = entries.size();
    int j = 0;
    for (int i = 1; i < len; i++) {
      if (comp.compare(entries.get(j), entries.get(i)) != 0) {
        entries.set(++j, entries.get(i));
      }
    }
    entries = entries.subList(0, j + 1);

    // Build the automaton.
    final Outputs<Object> outputs = NoOutputs.getSingleton();
    final Object empty = outputs.getNoOutput();
    final Builder<Object> builder = new Builder<Object>(FST.INPUT_TYPE.BYTE4, outputs);
    final IntsRef scratchIntsRef = new IntsRef(10);
    for (Entry e : entries) {
      final int termLength = scratchIntsRef.length = e.term.length;

      scratchIntsRef.grow(termLength);
      final int [] ints = scratchIntsRef.ints;
      final char [] chars = e.term;
      for (int i = termLength; --i >= 0;) {
        ints[i] = chars[i];
      }
      builder.add(scratchIntsRef, empty);
    }
    return builder.finish();
  }

  /**
   * Prepends each entry's weight to the entry, encoded as a single character, so that the
   * root automaton node fans out to all possible priorities, starting with the arc that has
   * the highest weight.
   */
  private void encodeWeightPrefix(List<Entry> entries) {
    for (Entry e : entries) {
      int weight = (int) e.weight;
      assert (weight >= 0 && weight <= buckets) :
        "Weight out of range: " + weight + " [" + buckets + "]";

      // There should be a single empty char reserved in front for the weight.
      e.term[0] = (char) weight;
    }
  }

  /**
   * Splits the [min, max] weight range into buckets, reassigning weights. Entries' weights are
   * remapped to the [0, buckets] range (so there are buckets + 1 buckets, in fact).
   */
  private void redistributeWeightsProportionalMinMax(List<Entry> entries, int buckets) {
    float min = entries.get(0).weight;
    float max = min;
    for (Entry e : entries) {
      min = Math.min(e.weight, min);
      max = Math.max(e.weight, max);
    }

    final float range = max - min;
    for (Entry e : entries) {
      e.weight = (int) (buckets * ((e.weight - min) / range)); // int cast is equivalent to floor().
    }
  }
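
  // A worked example of the remapping above: with weights {2, 5, 11} and buckets = 10,
  // min = 2, max = 11 and range = 9, so entries map to floor(10 * (w - 2) / 9),
  // i.e. {0, 3, 10}: the smallest weight always lands in bucket 0 and the largest
  // in the top bucket (== buckets).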

  /**
   * Deserialization from disk.
   */
  @Override
  public synchronized boolean load(File storeDir) throws IOException {
    File data = new File(storeDir, FILENAME);
    if (!data.exists() || !data.canRead()) {
      return false;
    }

    InputStream is = new BufferedInputStream(new FileInputStream(data));
    try {
      this.automaton = new FST<Object>(new InputStreamDataInput(is), NoOutputs.getSingleton());
      cacheRootArcs();
    } finally {
      IOUtils.close(is);
    }
    return true;
  }

  /**
   * Serialization to disk.
   */
  @Override
  public synchronized boolean store(File storeDir) throws IOException {
    if (!storeDir.exists() || !storeDir.isDirectory() || !storeDir.canWrite()) {
      return false;
    }

    if (this.automaton == null)
      return false;

    File data = new File(storeDir, FILENAME);
    OutputStream os = new BufferedOutputStream(new FileOutputStream(data));
    try {
      this.automaton.save(new OutputStreamDataOutput(os));
    } finally {
      IOUtils.close(os);
    }

    return true;
  }
}

@@ -0,0 +1,58 @@
package org.apache.lucene.search.suggest.fst;

import org.apache.lucene.util.NumericUtils;

/**
 * Converts normalized float representations ({@link Float#floatToIntBits(float)})
 * into integers that are directly sortable as unsigned int4 values (or as longs
 * after promotion with the higher 32 bits zeroed).
 */
class FloatMagic {
  /**
   * Converts a float to a directly sortable unsigned integer. For sortable signed
   * integers, see {@link NumericUtils#floatToSortableInt(float)}.
   */
  public static int toSortable(float f) {
    return floatBitsToUnsignedOrdered(Float.floatToRawIntBits(f));
  }

  /**
   * Back from {@link #toSortable(float)} to a float.
   */
  public static float fromSortable(int v) {
    return Float.intBitsToFloat(unsignedOrderedToFloatBits(v));
  }

  /**
   * Converts float bits to directly sortable bits.
   * Normalizes all NaNs to canonical form.
   */
  static int floatBitsToUnsignedOrdered(int v) {
    // Canonicalize NaN ranges. I assume this check will be faster here than
    // (v == v) == false on the FPU? We don't distinguish between different
    // flavors of NaNs here (see http://en.wikipedia.org/wiki/NaN). I guess
    // in Java this doesn't matter much anyway.
    if ((v & 0x7fffffff) > 0x7f800000) {
      // Apply the logic below to a canonical "quiet NaN".
      return 0x7fc00000 ^ 0x80000000;
    }

    if (v < 0) {
      // Reverse the order of negative values and push them before positive values.
      return ~v;
    } else {
      // Shift positive values after negatives but before NaNs; they are already in sorted order.
      return v ^ 0x80000000;
    }
  }

  /**
   * Back from {@link #floatBitsToUnsignedOrdered(int)}.
   */
  static int unsignedOrderedToFloatBits(int v) {
    if (v < 0)
      return v & ~0x80000000;
    else
      return ~v;
  }
}
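
A hypothetical usage sketch (the demo class is made up and assumes it lives in the same package as the package-private FloatMagic above): sortable representations, zero-extended to long for an unsigned comparison, preserve the natural float ordering and round-trip back to the original bits.

class FloatMagicDemo {
  public static void main(String[] args) {
    float[] values = { -1.5f, -0.0f, 0.0f, 0.25f, Float.POSITIVE_INFINITY };
    long previous = Long.MIN_VALUE;
    for (float f : values) {
      // Zero-extend to a long so the comparison is effectively unsigned.
      long sortable = FloatMagic.toSortable(f) & 0xffffffffL;
      if (sortable < previous) throw new AssertionError("Not monotonic at: " + f);
      previous = sortable;
      // Round-trip back to the original float bits.
      assert Float.floatToIntBits(FloatMagic.fromSortable((int) sortable)) == Float.floatToIntBits(f);
    }
    System.out.println("toSortable preserves float ordering.");
  }
}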

@@ -0,0 +1,28 @@
package org.apache.lucene.search.suggest.fst;

import java.util.*;

import org.apache.lucene.util.BytesRef;

/**
 * A {@link BytesRefSorter} that keeps all the entries in memory.
 */
public final class InMemorySorter implements BytesRefSorter {
  // TODO: use a single byte[] to back all entries?
  private final ArrayList<BytesRef> refs = new ArrayList<BytesRef>();

  private boolean closed = false;

  @Override
  public void add(BytesRef utf8) {
    if (closed) throw new IllegalStateException();
    refs.add(BytesRef.deepCopyOf(utf8));
  }

  @Override
  public Iterator<BytesRef> iterator() {
    closed = true;
    Collections.sort(refs, BytesRef.getUTF8SortedAsUnicodeComparator());
    return Collections.unmodifiableCollection(refs).iterator();
  }
}
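
A small usage sketch of the sorter contract above (the demo class and sample strings are made up): entries may be added in any order, iteration is in UTF-8 byte order, and add() after iterator() throws.

import java.util.Iterator;
import org.apache.lucene.util.BytesRef;

class InMemorySorterDemo {
  public static void main(String[] args) throws Exception {
    InMemorySorter sorter = new InMemorySorter();
    sorter.add(new BytesRef("zebra"));
    sorter.add(new BytesRef("apple"));
    for (Iterator<BytesRef> i = sorter.iterator(); i.hasNext();) {
      System.out.println(i.next().utf8ToString()); // "apple", then "zebra".
    }
    try {
      sorter.add(new BytesRef("oops"));
    } catch (IllegalStateException e) {
      System.out.println("add() after iterator() is rejected, as the contract requires.");
    }
  }
}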

@@ -0,0 +1,483 @@
package org.apache.lucene.search.suggest.fst;

import java.io.*;
import java.util.*;

import org.apache.lucene.util.*;
import org.apache.lucene.util.PriorityQueue;

// TODO: the buffer is currently a byte[][] which, with very small arrays, will terribly
// overallocate memory (alignments) and put a lot of pressure on the GC.
//
// We could move it to a single byte[] and use custom sorting, but we'd need to check if this
// yields any improvement first.

/**
 * On-disk sorting of byte arrays. Each byte array (entry) is composed of the following
 * fields:
 * <ul>
 * <li>(two bytes) length of the following byte array,
 * <li>exactly the above count of bytes for the sequence to be sorted.
 * </ul>
 *
 * @see #sort(File, File)
 */
public final class Sort {
  public final static int MB = 1024 * 1024;
  public final static int GB = MB * 1024;

  /**
   * Minimum recommended buffer size for sorting.
   */
  public final static int MIN_BUFFER_SIZE_MB = 32;

  /**
   * Maximum number of temporary files before doing an intermediate merge.
   */
  public final static int MAX_TEMPFILES = 128;

  /**
   * Minimum slot buffer expansion.
   */
  private final static int MIN_EXPECTED_GROWTH = 1000;

  /**
   * A slightly more descriptive unit for constructors.
   *
   * @see #automatic()
   * @see #megabytes(int)
   */
  public static final class BufferSize {
    final int bytes;

    private BufferSize(long bytes) {
      if (bytes > Integer.MAX_VALUE) {
        throw new IllegalArgumentException("Buffer too large for Java ("
            + (Integer.MAX_VALUE / MB) + "mb max): " + bytes);
      }

      this.bytes = (int) bytes;
    }

    public static BufferSize megabytes(int mb) {
      return new BufferSize(mb * MB);
    }

    /**
     * Approximately half of the currently available free heap, but no less
     * than {@link #MIN_BUFFER_SIZE_MB}.
     */
    public static BufferSize automatic() {
      long freeHeap = Runtime.getRuntime().freeMemory();
      // Math.max keeps the buffer at or above the recommended minimum, as documented.
      return new BufferSize(Math.max(MIN_BUFFER_SIZE_MB * MB, freeHeap / 2));
    }
  }

  /**
   * Compares byte[] slots in unsigned byte order.
   */
  static final Comparator<byte[]> unsignedByteOrderComparator = new Comparator<byte[]>() {
    public int compare(byte[] left, byte[] right) {
      final int max = Math.min(left.length, right.length);
      for (int i = 0; i < max; i++) {
        int diff = (left[i] & 0xff) - (right[i] & 0xff);
        if (diff != 0)
          return diff;
      }
      return left.length - right.length;
    }
  };

  /**
   * Sort info (mostly for debugging).
   */
  public class SortInfo {
    public int tempMergeFiles;
    public int mergeRounds;
    public int lines;
    public long mergeTime;
    public long sortTime;
    public long totalTime;
    public long readTime;
    public final long bufferSize = ramBufferSize.bytes;

    @Override
    public String toString() {
      return String.format(Locale.ENGLISH,
          "time=%.2f sec. total (%.2f reading, %.2f sorting, %.2f merging), lines=%d, temp files=%d, merges=%d, soft ram limit=%.2f MB",
          totalTime / 1000.0d, readTime / 1000.0d, sortTime / 1000.0d, mergeTime / 1000.0d,
          lines, tempMergeFiles, mergeRounds,
          (double) bufferSize / MB);
    }
  }

  private final static byte [][] EMPTY = new byte [0][];

  private final BufferSize ramBufferSize;
  private final File tempDirectory;

  private byte [][] buffer = new byte [0][];
  private SortInfo sortInfo;
  private int maxTempFiles;

  /**
   * Default constructor.
   *
   * @see #defaultTempDir()
   * @see BufferSize#automatic()
   */
  public Sort() throws IOException {
    this(BufferSize.automatic(), defaultTempDir(), MAX_TEMPFILES);
  }

  /**
   * All-details constructor.
   */
  public Sort(BufferSize ramBufferSize, File tempDirectory, int maxTempfiles) {
    if (ramBufferSize.bytes < 1024 * 1024 / 2) {
      // A half-meg buffer is the absolute minimum.
      throw new IllegalArgumentException("At least 0.5MB RAM buffer is needed: "
          + ramBufferSize.bytes);
    }

    if (maxTempfiles < 2) {
      throw new IllegalArgumentException("maxTempFiles must be >= 2");
    }

    this.ramBufferSize = ramBufferSize;
    this.tempDirectory = tempDirectory;
    this.maxTempFiles = maxTempfiles;
  }

  /**
   * Sorts input to output. The buffer size set in the constructor is treated as a
   * hint: the amount of allocated memory may deviate from it (be smaller or larger).
   */
  public SortInfo sort(File input, File output) throws IOException {
    sortInfo = new SortInfo();
    sortInfo.totalTime = System.currentTimeMillis();

    output.delete();

    ArrayList<File> merges = new ArrayList<File>();
    ByteSequencesReader is = new ByteSequencesReader(input);
    boolean success = false;
    try {
      int lines = 0;
      while ((lines = readPartition(is)) > 0) {
        merges.add(sortPartition(lines));
        sortInfo.tempMergeFiles++;
        sortInfo.lines += lines;

        // Handle intermediate merges.
        if (merges.size() == maxTempFiles) {
          File intermediate = File.createTempFile("sort", "intermediate", tempDirectory);
          mergePartitions(merges, intermediate);
          for (File file : merges) {
            file.delete();
          }
          merges.clear();
          merges.add(intermediate);
          sortInfo.tempMergeFiles++;
        }
      }
      success = true;
    } finally {
      if (success)
        IOUtils.close(is);
      else
        IOUtils.closeWhileHandlingException(is);
    }

    // A single partition: try to rename; copy if the rename is unsuccessful.
    if (merges.size() == 1) {
      // If a simple rename doesn't work, the output is probably on a different
      // volume; copy the input instead.
      if (!merges.get(0).renameTo(output)) {
        copy(merges.get(0), output);
      }
    } else {
      // Otherwise merge the partitions with a priority queue.
      mergePartitions(merges, output);
      for (File file : merges) {
        file.delete();
      }
    }

    sortInfo.totalTime = (System.currentTimeMillis() - sortInfo.totalTime);
    return sortInfo;
  }

  /**
   * Returns the default temporary directory. By default, java.io.tmpdir. If not accessible
   * or not available, an IOException is thrown.
   */
  public static File defaultTempDir() throws IOException {
    String tempDirPath = System.getProperty("java.io.tmpdir");
    if (tempDirPath == null)
      throw new IOException("Java has no temporary folder property (java.io.tmpdir)?");

    File tempDirectory = new File(tempDirPath);
    if (!tempDirectory.exists() || !tempDirectory.canWrite()) {
      throw new IOException("Java's temporary folder not present or writeable?: "
          + tempDirectory.getAbsolutePath());
    }
    return tempDirectory;
  }

  /**
   * Copies one file to another.
   */
  private static void copy(File file, File output) throws IOException {
    // 16kb copy buffer (empirical pick).
    byte [] buffer = new byte [16 * 1024];
    InputStream is = null;
    OutputStream os = null;
    try {
      is = new FileInputStream(file);
      os = new FileOutputStream(output);
      int length;
      while ((length = is.read(buffer)) > 0) {
        os.write(buffer, 0, length);
      }
    } finally {
      IOUtils.close(is, os);
    }
  }

  /** Sorts a single partition in-memory. */
  protected File sortPartition(int len) throws IOException {
    byte [][] data = this.buffer;
    File tempFile = File.createTempFile("sort", "partition", tempDirectory);

    long start = System.currentTimeMillis();
    Arrays.sort(data, 0, len, unsignedByteOrderComparator);
    sortInfo.sortTime += (System.currentTimeMillis() - start);

    ByteSequencesWriter out = new ByteSequencesWriter(tempFile);
    try {
      for (int i = 0; i < len; i++) {
        assert data[i].length <= Short.MAX_VALUE;
        out.write(data[i]);
      }
      out.close();

      // Clean up the buffer for the next partition.
      this.buffer = EMPTY;
      return tempFile;
    } finally {
      IOUtils.close(out);
    }
  }

  /** Merges a list of sorted temporary files (partitions) into an output file. */
  void mergePartitions(List<File> merges, File outputFile) throws IOException {
    long start = System.currentTimeMillis();

    ByteSequencesWriter out = new ByteSequencesWriter(outputFile);

    PriorityQueue<FileAndTop> queue = new PriorityQueue<FileAndTop>(merges.size()) {
      protected boolean lessThan(FileAndTop a, FileAndTop b) {
        return a.current.compareTo(b.current) < 0;
      }
    };

    ByteSequencesReader [] streams = new ByteSequencesReader [merges.size()];
    try {
      // Open streams and read the top entry of each file.
      for (int i = 0; i < merges.size(); i++) {
        streams[i] = new ByteSequencesReader(merges.get(i));
        byte line[] = streams[i].read();
        if (line != null) {
          queue.insertWithOverflow(new FileAndTop(i, line));
        }
      }

      // The Unix sort utility uses an ordered array of files to pick the next line from,
      // updating it as it reads new lines. The PQ used here is a more elegant solution with
      // a nicer theoretical complexity bound :) The entire sorting process is I/O bound
      // anyway, so it shouldn't make much of a difference (didn't check).
      FileAndTop top;
      while ((top = queue.top()) != null) {
        out.write(top.current);
        if (!streams[top.fd].read(top.current)) {
          queue.pop();
        } else {
          queue.updateTop();
        }
      }

      sortInfo.mergeTime += System.currentTimeMillis() - start;
      sortInfo.mergeRounds++;
    } finally {
      // The logic below: if an exception occurs while closing out, it takes priority over
      // exceptions thrown while closing the streams.
      try {
        IOUtils.close(streams);
      } finally {
        IOUtils.close(out);
      }
    }
  }

  /** Reads in a single partition of data. */
  int readPartition(ByteSequencesReader reader) throws IOException {
    long start = System.currentTimeMillis();

    // We will be reallocating from scratch.
    Arrays.fill(this.buffer, null);

    int bytesLimit = this.ramBufferSize.bytes;
    byte [][] data = this.buffer;
    byte[] line;
    int linesRead = 0;
    while ((line = reader.read()) != null) {
      if (linesRead + 1 >= data.length) {
        data = Arrays.copyOf(data,
            ArrayUtil.oversize(linesRead + MIN_EXPECTED_GROWTH,
                RamUsageEstimator.NUM_BYTES_OBJECT_REF));
      }
      data[linesRead++] = line;

      // Account for the created objects.
      // (Buffer slots do not count against the buffer size.)
      bytesLimit -= line.length + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER;
      if (bytesLimit < 0) {
        break;
      }
    }
    this.buffer = data;

    sortInfo.readTime += (System.currentTimeMillis() - start);
    return linesRead;
  }

  static class FileAndTop {
    final int fd;
    final BytesRef current;

    FileAndTop(int fd, byte [] firstLine) {
      this.fd = fd;
      this.current = new BytesRef(firstLine);
    }
  }

  /**
   * Utility class to emit length-prefixed byte[] entries to an output stream for sorting.
   * Complementary to {@link ByteSequencesReader}.
   */
  public static class ByteSequencesWriter implements Closeable {
    private final DataOutput os;

    public ByteSequencesWriter(File file) throws IOException {
      this(new DataOutputStream(
          new BufferedOutputStream(
              new FileOutputStream(file))));
    }

    public ByteSequencesWriter(DataOutput os) {
      this.os = os;
    }

    public void write(BytesRef ref) throws IOException {
      assert ref != null;
      write(ref.bytes, ref.offset, ref.length);
    }

    public void write(byte [] bytes) throws IOException {
      write(bytes, 0, bytes.length);
    }

    public void write(byte [] bytes, int off, int len) throws IOException {
      assert bytes != null;
      assert off >= 0 && off + len <= bytes.length;
      assert len >= 0;
      os.writeShort(len);
      os.write(bytes, off, len);
    }

    /**
     * Closes the provided {@link DataOutput} if it is {@link Closeable}.
     */
    @Override
    public void close() throws IOException {
      if (os instanceof Closeable) {
        ((Closeable) os).close();
      }
    }
  }
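
  // On-disk layout produced by the writer above (and consumed by the reader below):
  // for each entry, a two-byte big-endian length N followed by exactly N bytes of
  // payload. "foo" is thus encoded as 00 03 66 6F 6F.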

  /**
   * Utility class to read length-prefixed byte[] entries from an input.
   * Complementary to {@link ByteSequencesWriter}.
   */
  public static class ByteSequencesReader implements Closeable {
    private final DataInput is;

    public ByteSequencesReader(File file) throws IOException {
      this(new DataInputStream(
          new BufferedInputStream(
              new FileInputStream(file))));
    }

    public ByteSequencesReader(DataInput is) {
      this.is = is;
    }

    /**
     * Reads the next entry into the provided {@link BytesRef}. The internal
     * storage is resized if needed.
     *
     * @return Returns <code>false</code> if EOF occurred when trying to read
     * the header of the next sequence. Returns <code>true</code> otherwise.
     * @throws EOFException if the file ends before the full sequence is read.
     */
    public boolean read(BytesRef ref) throws IOException {
      short length;
      try {
        length = is.readShort();
      } catch (EOFException e) {
        return false;
      }

      ref.grow(length);
      ref.offset = 0;
      ref.length = length;
      is.readFully(ref.bytes, 0, length);
      return true;
    }

    /**
     * Reads the next entry and returns it if successful.
     *
     * @see #read(BytesRef)
     *
     * @return Returns <code>null</code> if EOF occurred before the next entry
     * could be read.
     * @throws EOFException if the file ends before the full sequence is read.
     */
    public byte[] read() throws IOException {
      short length;
      try {
        length = is.readShort();
      } catch (EOFException e) {
        return null;
      }

      assert length >= 0 : "Sanity: sequence length < 0: " + length;
      byte [] result = new byte [length];
      is.readFully(result);
      return result;
    }

    /**
     * Closes the provided {@link DataInput} if it is {@link Closeable}.
     */
    @Override
    public void close() throws IOException {
      if (is instanceof Closeable) {
        ((Closeable) is).close();
      }
    }
  }
}
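
A minimal end-to-end sketch of the class above (the demo class and file names are hypothetical, not part of the Lucene sources): write length-prefixed entries, sort them on disk, then read them back in unsigned byte order.

import java.io.File;
import org.apache.lucene.search.suggest.fst.Sort;

public class SortDemo {
  public static void main(String[] args) throws Exception {
    File input = new File("input.bin"), output = new File("sorted.bin");

    // Write two length-prefixed entries, deliberately out of order.
    Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(input);
    writer.write("banana".getBytes("UTF-8"));
    writer.write("apple".getBytes("UTF-8"));
    writer.close();

    // Sort on disk with default settings (automatic buffer, java.io.tmpdir).
    Sort.SortInfo info = new Sort().sort(input, output);
    System.out.println(info); // Timing and merge statistics.

    // Read the entries back; they now come out in sorted order.
    Sort.ByteSequencesReader reader = new Sort.ByteSequencesReader(output);
    for (byte[] entry; (entry = reader.read()) != null;) {
      System.out.println(new String(entry, "UTF-8")); // "apple", then "banana".
    }
    reader.close();
  }
}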

@@ -29,10 +29,9 @@ import java.util.Locale;
import java.util.Random;
import java.util.concurrent.Callable;

import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.*;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.fst.FSTLookup;
import org.apache.lucene.search.suggest.fst.FSTCompletionLookup;
import org.apache.lucene.search.suggest.jaspell.JaspellLookup;
import org.apache.lucene.search.suggest.tst.TSTLookup;


@@ -48,7 +47,7 @@ public class LookupBenchmarkTest extends LuceneTestCase {
  private final List<Class<? extends Lookup>> benchmarkClasses = Arrays.asList(
      JaspellLookup.class,
      TSTLookup.class,
      FSTLookup.class);
      FSTCompletionLookup.class);

  private final static int rounds = 15;
  private final static int warmup = 5;


@@ -19,7 +19,7 @@ package org.apache.lucene.search.suggest;
import java.io.File;

import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.fst.FSTLookup;
import org.apache.lucene.search.suggest.fst.FSTCompletionLookup;
import org.apache.lucene.search.suggest.jaspell.JaspellLookup;
import org.apache.lucene.search.suggest.tst.TSTLookup;
import org.apache.lucene.util.LuceneTestCase;

@@ -51,7 +51,7 @@ public class PersistenceTest extends LuceneTestCase {
  }

  public void testFSTPersistence() throws Exception {
    runTest(FSTLookup.class, false);
    runTest(FSTCompletionLookup.class, false);
  }

  private void runTest(Class<? extends Lookup> lookupClass,


@@ -0,0 +1,44 @@
package org.apache.lucene.search.suggest.fst;

import java.util.Iterator;

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;

public class BytesRefSortersTest extends LuceneTestCase {
  @Test
  public void testExternalRefSorter() throws Exception {
    check(new ExternalRefSorter(new Sort()));
  }

  @Test
  public void testInMemorySorter() throws Exception {
    check(new InMemorySorter());
  }

  private void check(BytesRefSorter sorter) throws Exception {
    for (int i = 0; i < 100; i++) {
      byte [] current = new byte [random.nextInt(256)];
      random.nextBytes(current);
      sorter.add(new BytesRef(current));
    }

    // Create two iterators and check that they're aligned with each other.
    Iterator<BytesRef> i1 = sorter.iterator();
    Iterator<BytesRef> i2 = sorter.iterator();

    // Verify sorter contract.
    try {
      sorter.add(new BytesRef(new byte [1]));
      fail("expected contract violation.");
    } catch (IllegalStateException e) {
      // Expected.
    }

    while (i1.hasNext() && i2.hasNext()) {
      assertEquals(i1.next(), i2.next());
    }
    assertEquals(i1.hasNext(), i2.hasNext());
  }
}

@@ -17,40 +17,38 @@ package org.apache.lucene.search.suggest.fst;
 * limitations under the License.
 */

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.Random;
import java.util.*;

import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.search.suggest.fst.FSTLookup;
import org.apache.lucene.util.LuceneTestCase;

import org.apache.lucene.search.suggest.LookupBenchmarkTest;
import org.apache.lucene.search.suggest.TermFreq;
import org.apache.lucene.search.suggest.TermFreqArrayIterator;
import org.apache.lucene.search.suggest.*;
import org.apache.lucene.search.suggest.fst.FSTCompletion.Completion;
import org.apache.lucene.util.*;

/**
 * Unit tests for {@link FSTLookup}.
 * Unit tests for {@link FSTCompletion}.
 */
public class FSTLookupTest extends LuceneTestCase {
public class FSTCompletionTest extends LuceneTestCase {
  public static TermFreq tf(String t, float v) {
    return new TermFreq(t, v);
  }

  private FSTLookup lookup;
  private FSTCompletion completion;
  private FSTCompletion completionAlphabetical;

  public void setUp() throws Exception {
    super.setUp();

    lookup = new FSTLookup();
    lookup.build(new TermFreqArrayIterator(evalKeys()));
    FSTCompletionBuilder builder = new FSTCompletionBuilder();
    for (TermFreq tf : evalKeys()) {
      builder.add(new BytesRef(tf.term), (int) tf.v);
    }
    completion = builder.build();
    completionAlphabetical = new FSTCompletion(completion.getFST(), false, true);
  }

  private TermFreq[] evalKeys() {
    final TermFreq[] keys = new TermFreq[] {
        tf("one", 0.5f),
        tf("one", 0),
        tf("oneness", 1),
        tf("onerous", 1),
        tf("onesimus", 1),

@@ -64,103 +62,152 @@ public class FSTLookupTest extends LuceneTestCase {
        tf("foundation", 1),
        tf("fourblah", 1),
        tf("fourteen", 1),
        tf("four", 0.5f),
        tf("fourier", 0.5f),
        tf("fourty", 0.5f),
        tf("four", 0f),
        tf("fourier", 0f),
        tf("fourty", 0f),
        tf("xo", 1),
    };
    return keys;
  }

  public void testExactMatchHighPriority() throws Exception {
    assertMatchEquals(lookup.lookup("two", true, 1), "two/1.0");
    assertMatchEquals(completion.lookup("two", 1),
        "two/1.0");
  }

  public void testExactMatchLowPriority() throws Exception {
    assertMatchEquals(lookup.lookup("one", true, 2),
    assertMatchEquals(completion.lookup("one", 2),
        "one/0.0",
        "oneness/1.0");
  }

  public void testExactMatchReordering() throws Exception {
    // Check reordering of exact matches.
    assertMatchEquals(completion.lookup("four", 4),
        "four/0.0",
        "fourblah/1.0",
        "fourteen/1.0",
        "fourier/0.0");
  }

  public void testRequestedCount() throws Exception {
    // 'one' is promoted after collecting two higher ranking results.
    assertMatchEquals(lookup.lookup("one", true, 2),
        "one/0.0",
        "oneness/1.0");

    // 'one' is at the top after collecting all alphabetical results.
    assertMatchEquals(lookup.lookup("one", false, 2),
    assertMatchEquals(completion.lookup("one", 2),
        "one/0.0",
        "oneness/1.0");

    // 'four' is collected in a bucket and then again as an exact match.
    assertMatchEquals(lookup.lookup("four", true, 2),
    assertMatchEquals(completion.lookup("four", 2),
        "four/0.0",
        "fourblah/1.0");

    // Check reordering of exact matches.
    assertMatchEquals(lookup.lookup("four", true, 4),
    assertMatchEquals(completion.lookup("four", 4),
        "four/0.0",
        "fourblah/1.0",
        "fourteen/1.0",
        "fourier/0.0");

    lookup = new FSTLookup(10, false);
    lookup.build(new TermFreqArrayIterator(evalKeys()));
    // 'one' is at the top after collecting all alphabetical results.
    assertMatchEquals(completionAlphabetical.lookup("one", 2),
        "one/0.0",
        "oneness/1.0");

    // 'one' is not promoted after collecting two higher ranking results.
    assertMatchEquals(lookup.lookup("one", true, 2),
    FSTCompletion noPromotion = new FSTCompletion(completion.getFST(), true, false);
    assertMatchEquals(noPromotion.lookup("one", 2),
        "oneness/1.0",
        "onerous/1.0");

    // 'one' is at the top after collecting all alphabetical results.
    assertMatchEquals(lookup.lookup("one", false, 2),
    assertMatchEquals(completionAlphabetical.lookup("one", 2),
        "one/0.0",
        "oneness/1.0");
  }

  public void testMiss() throws Exception {
    assertMatchEquals(lookup.lookup("xyz", true, 1));
    assertMatchEquals(completion.lookup("xyz", 1));
  }

  public void testAlphabeticWithWeights() throws Exception {
    assertEquals(0, lookup.lookup("xyz", false, 1).size());
    assertEquals(0, completionAlphabetical.lookup("xyz", 1).size());
  }

  public void testFullMatchList() throws Exception {
    assertMatchEquals(lookup.lookup("one", true, Integer.MAX_VALUE),
    assertMatchEquals(completion.lookup("one", Integer.MAX_VALUE),
        "oneness/1.0",
        "onerous/1.0",
        "onesimus/1.0",
        "one/0.0");
  }

  public void testThreeByte() throws Exception {
    String key = new String(new byte[] {
        (byte) 0xF0, (byte) 0xA4, (byte) 0xAD, (byte) 0xA2}, "UTF-8");
    FSTCompletionBuilder builder = new FSTCompletionBuilder();
    builder.add(new BytesRef(key), 0);

    FSTCompletion lookup = builder.build();
    List<Completion> result = lookup.lookup(key, 1);
    assertEquals(1, result.size());
  }

  public void testLargeInputConstantWeights() throws Exception {
    FSTCompletionLookup lookup = new FSTCompletionLookup(10, true);

    Random r = random;
    List<TermFreq> keys = new ArrayList<TermFreq>();
    for (int i = 0; i < 5000; i++) {
      keys.add(new TermFreq(_TestUtil.randomSimpleString(r), -1.0f));
    }

    lookup.build(new TermFreqArrayIterator(keys));

    // All the weights were constant, so all returned buckets must be constant,
    // whatever they are.
    Float previous = null;
    for (TermFreq tf : keys) {
      Float current = lookup.get(tf.term);
      if (previous != null) {
        assertEquals(previous, current);
      }
      previous = current;
    }
  }

  @Nightly
  public void testMultilingualInput() throws Exception {
    List<TermFreq> input = LookupBenchmarkTest.readTop50KWiki();

    lookup = new FSTLookup();
    FSTCompletionLookup lookup = new FSTCompletionLookup();
    lookup.build(new TermFreqArrayIterator(input));

    for (TermFreq tf : input) {
      assertTrue("Not found: " + tf.term, lookup.get(tf.term) != null);
      assertEquals(tf.term, lookup.lookup(tf.term, true, 1).get(0).key);
    }

    List<LookupResult> result = lookup.lookup("wit", true, 5);
    assertEquals(5, result.size());
    assertTrue(result.get(0).key.equals("wit")); // exact match.
    assertTrue(result.get(1).key.equals("with")); // highest count.
  }

  public void testEmptyInput() throws Exception {
    lookup = new FSTLookup();
    lookup.build(new TermFreqArrayIterator(new TermFreq[0]));

    assertMatchEquals(lookup.lookup("", true, 10));
    completion = new FSTCompletionBuilder().build();
    assertMatchEquals(completion.lookup("", 10));
  }

  @Nightly
  public void testRandom() throws Exception {
    List<TermFreq> freqs = new ArrayList<TermFreq>();
    Random rnd = random;
    for (int i = 0; i < 5000; i++) {
      freqs.add(new TermFreq("" + rnd.nextLong(), rnd.nextInt(100)));
    for (int i = 0; i < 2500 + rnd.nextInt(2500); i++) {
      float weight = rnd.nextFloat() * 100;
      freqs.add(new TermFreq("" + rnd.nextLong(), weight));
    }
    lookup = new FSTLookup();

    FSTCompletionLookup lookup = new FSTCompletionLookup();
    lookup.build(new TermFreqArrayIterator(freqs.toArray(new TermFreq[freqs.size()])));

    for (TermFreq tf : freqs) {

@@ -174,12 +221,13 @@ public class FSTLookupTest extends LuceneTestCase {
    }
  }

  private void assertMatchEquals(List<LookupResult> res, String... expected) {
  private void assertMatchEquals(List<Completion> res, String... expected) {
    String [] result = new String [res.size()];
    for (int i = 0; i < res.size(); i++)
    for (int i = 0; i < res.size(); i++) {
      result[i] = res.get(i).toString();
    }

    if (!Arrays.equals(expected, result)) {
    if (!Arrays.equals(stripScore(expected), stripScore(result))) {
      int colLen = Math.max(maxLen(expected), maxLen(result));

      StringBuilder b = new StringBuilder();

@@ -196,6 +244,14 @@
    }
  }

  private String[] stripScore(String[] expected) {
    String [] result = new String [expected.length];
    for (int i = 0; i < result.length; i++) {
      result[i] = expected[i].replaceAll("\\/[0-9\\.]+", "");
    }
    return result;
  }

  private int maxLen(String[] result) {
    int len = 0;
    for (String s : result)
@@ -0,0 +1,119 @@
package org.apache.lucene.search.suggest.fst;

import java.util.*;

import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.NumericUtils;
import org.junit.Ignore;
import org.junit.Test;

public class FloatMagicTest extends LuceneTestCase {
  public void testFloatMagic() {
    ArrayList<Float> floats = new ArrayList<Float>(Arrays.asList(new Float [] {
        Float.intBitsToFloat(0x7f800001), // NaN (invalid combination).
        Float.intBitsToFloat(0x7fffffff), // NaN (invalid combination).
        Float.intBitsToFloat(0xff800001), // NaN (invalid combination).
        Float.intBitsToFloat(0xffffffff), // NaN (invalid combination).
        Float.POSITIVE_INFINITY,
        Float.MAX_VALUE,
        100f,
        0f,
        0.1f,
        Float.MIN_VALUE,
        Float.NaN,
        -0.0f,
        -Float.MIN_VALUE,
        -0.1f,
        -1f,
        -10f,
        Float.NEGATIVE_INFINITY }));

    // Sort them using java.util.Collections.
    Collections.sort(floats);

    // Convert to the sortable int4 representation (as a long, to get an unsigned sort).
    long [] int4 = new long [floats.size()];
    for (int i = 0; i < floats.size(); i++) {
      int4[i] = FloatMagic.toSortable(floats.get(i)) & 0xffffffffL;

      System.out.println(
          String.format("raw %8s sortable %8s %8s numutils %8s %s",
              Integer.toHexString(Float.floatToRawIntBits(floats.get(i))),
              Integer.toHexString(FloatMagic.toSortable(floats.get(i))),
              Integer.toHexString(FloatMagic.unsignedOrderedToFloatBits(FloatMagic.toSortable(floats.get(i)))),
              Integer.toHexString(NumericUtils.floatToSortableInt(floats.get(i))),
              floats.get(i)));
    }

    // Sort and compare. Should be in identical order.
    Arrays.sort(int4);
    ArrayList<Float> backFromFixed = new ArrayList<Float>();
    for (int i = 0; i < int4.length; i++) {
      backFromFixed.add(FloatMagic.fromSortable((int) int4[i]));
    }

    for (int i = 0; i < int4.length; i++) {
      System.out.println(
          floats.get(i) + " " + FloatMagic.fromSortable((int) int4[i]));
    }

    assertEquals(floats, backFromFixed);
  }

  @Ignore("Once checked, valid forever?") @Test
  public void testRoundTripFullRange() {
    int i = 0;
    do {
      float f = Float.intBitsToFloat(i);
      float f2 = FloatMagic.fromSortable(FloatMagic.toSortable(f));

      if (!((Float.isNaN(f) && Float.isNaN(f2)) || f == f2)) {
        throw new RuntimeException("! " + Integer.toHexString(i) + "> " + f + " " + f2);
      }

      if ((i & 0xffffff) == 0) {
        System.out.println(Integer.toHexString(i));
      }

      i++;
    } while (i != 0);
  }

  @Ignore("Once checked, valid forever?") @Test
  public void testIncreasingFullRange() {
    // -infinity ... -0.0
    for (int i = 0xff800000; i != 0x80000000; i--) {
      checkSmaller(i, i - 1);
    }

    // -0.0 +0.0
    checkSmaller(0x80000000, 0);

    // +0.0 ... +infinity
    for (int i = 0; i != 0x7f800000; i++) {
      checkSmaller(i, i + 1);
    }

    // All others are NaNs and should sort after positive infinity.
    final long infinity = toSortableL(Float.POSITIVE_INFINITY);
    for (int i = 0x7f800001; i != 0x7fffffff; i++) {
      assertTrue(infinity < toSortableL(Float.intBitsToFloat(i)));
    }
    for (int i = 0xff800001; i != 0xffffffff; i++) {
      assertTrue(infinity < toSortableL(Float.intBitsToFloat(i)));
    }
  }

  private long toSortableL(float f) {
    return FloatMagic.toSortable(f) & 0xffffffffL;
  }

  private void checkSmaller(int i1, int i2) {
    float f1 = Float.intBitsToFloat(i1);
    float f2 = Float.intBitsToFloat(i2);
    if (f1 > f2) {
      throw new AssertionError(f1 + " " + f2 + " " + i1 + " " + i2);
    }
    assertTrue(toSortableL(f1) < toSortableL(f2));
  }
}

@@ -0,0 +1,43 @@
package org.apache.lucene.search.suggest.fst;

import java.io.*;

import org.apache.lucene.util.BytesRef;

/**
 * Tries to build a suggester from a large data set. The input is a simple
 * newline-delimited text file.
 */
public class LargeInputFST {
  public static void main(String[] args) throws IOException {
    File input = new File("/home/dweiss/tmp/shuffled.dict");

    int buckets = 20;
    int shareMaxTail = 10;

    ExternalRefSorter sorter = new ExternalRefSorter(new Sort());
    FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, sorter, shareMaxTail);

    BufferedReader reader = new BufferedReader(
        new InputStreamReader(
            new FileInputStream(input), "UTF-8"));

    BytesRef scratch = new BytesRef();
    String line;
    int count = 0;
    while ((line = reader.readLine()) != null) {
      scratch.copyChars(line);
      builder.add(scratch, count % buckets);
      if ((count++ % 100000) == 0) {
        System.err.println("Line: " + count);
      }
    }

    System.out.println("Building FSTCompletion.");
    FSTCompletion completion = builder.build();

    File fstFile = new File("completion.fst");
    System.out.println("Done. Writing automaton: " + fstFile.getAbsolutePath());
    completion.getFST().save(fstFile);
  }
}

@@ -0,0 +1,126 @@
package org.apache.lucene.search.suggest.fst;

import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;

import org.apache.lucene.search.suggest.fst.Sort.BufferSize;
import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesWriter;
import org.apache.lucene.search.suggest.fst.Sort.SortInfo;
import org.apache.lucene.util.*;
import org.junit.*;

/**
 * Tests for on-disk merge sorting.
 */
public class TestSort extends LuceneTestCase {
  private File tempDir;

  @Before
  public void prepareTempDir() throws IOException {
    tempDir = _TestUtil.getTempDir("mergesort");
    _TestUtil.rmDir(tempDir);
    tempDir.mkdirs();
  }

  @After
  public void cleanup() throws IOException {
    if (tempDir != null)
      _TestUtil.rmDir(tempDir);
  }

  @Test
  public void testEmpty() throws Exception {
    checkSort(new Sort(), new byte [][] {});
  }

  @Test
  public void testSingleLine() throws Exception {
    checkSort(new Sort(), new byte [][] {
        "Single line only.".getBytes("UTF-8")
    });
  }

  @Test
  public void testIntermediateMerges() throws Exception {
    // Sort 20 mb worth of data with a 1mb buffer, merging two files at a time.
    SortInfo info = checkSort(new Sort(BufferSize.megabytes(1), Sort.defaultTempDir(), 2),
        generateRandom(Sort.MB * 20));
    assertTrue(info.mergeRounds > 10);
  }

  @Test
  public void testSmallRandom() throws Exception {
    // Sort 20 mb worth of data with a 1mb buffer.
    SortInfo sortInfo = checkSort(new Sort(BufferSize.megabytes(1), Sort.defaultTempDir(), Sort.MAX_TEMPFILES),
        generateRandom(Sort.MB * 20));
    assertEquals(1, sortInfo.mergeRounds);
  }

  @Test @Nightly
  public void testLargerRandom() throws Exception {
    // Sort 100MB worth of data with a 16mb buffer.
    checkSort(new Sort(BufferSize.megabytes(16), Sort.defaultTempDir(), Sort.MAX_TEMPFILES),
        generateRandom(Sort.MB * 100));
  }

  private byte[][] generateRandom(int howMuchData) {
    ArrayList<byte[]> data = new ArrayList<byte[]>();
    while (howMuchData > 0) {
      byte [] current = new byte [random.nextInt(256)];
      random.nextBytes(current);
      data.add(current);
      howMuchData -= current.length;
    }
    byte [][] bytes = data.toArray(new byte[data.size()][]);
    return bytes;
  }

  /**
   * Checks sorting of the given data on an instance of {@link Sort}.
   */
  private SortInfo checkSort(Sort sort, byte[][] data) throws IOException {
    File unsorted = writeAll("unsorted", data);

    Arrays.sort(data, Sort.unsignedByteOrderComparator);
    File golden = writeAll("golden", data);

    File sorted = new File(tempDir, "sorted");
    SortInfo sortInfo = sort.sort(unsorted, sorted);
    System.out.println("Input size [MB]: " + unsorted.length() / (1024 * 1024));
    System.out.println(sortInfo);

    assertFilesIdentical(golden, sorted);
    return sortInfo;
  }

  /**
   * Makes sure two files are byte-for-byte identical.
   */
  private void assertFilesIdentical(File golden, File sorted) throws IOException {
    assertEquals(golden.length(), sorted.length());

    byte [] buf1 = new byte [64 * 1024 * 1024];
    byte [] buf2 = new byte [64 * 1024 * 1024];
    int len;
    DataInputStream is1 = new DataInputStream(new FileInputStream(golden));
    DataInputStream is2 = new DataInputStream(new FileInputStream(sorted));
    while ((len = is1.read(buf1)) > 0) {
      is2.readFully(buf2, 0, len);
      for (int i = 0; i < len; i++) {
        assertEquals(buf1[i], buf2[i]);
      }
    }
    IOUtils.close(is1, is2);
  }

  private File writeAll(String name, byte[][] data) throws IOException {
    File file = new File(tempDir, name);
    ByteSequencesWriter w = new Sort.ByteSequencesWriter(file);
    for (byte [] datum : data) {
      w.write(datum);
    }
    w.close();
    return file;
  }
}

@@ -199,6 +199,11 @@ New Features
Optimizations
----------------------

* SOLR-2888: FSTSuggester refactoring: internal storage is now UTF-8,
  external sorting (on disk) prevents OOMs even with large data sets
  (the bottleneck is now FST construction), code cleanups and API cleanups.
  (Dawid Weiss)

* SOLR-1875: Per-segment field faceting for single valued string fields.
  Enable with facet.method=fcs, control the number of threads used with
  the "threads" local param on the facet.field param. This algorithm will

@@ -18,16 +18,15 @@ package org.apache.solr.spelling.suggest.fst;
 */

import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.fst.FSTLookup;
import org.apache.lucene.search.suggest.fst.*;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.spelling.suggest.LookupFactory;

/**
 * Factory for {@link FSTLookup}
 * Factory for {@link FSTCompletionLookup}
 */
public class FSTLookupFactory extends LookupFactory {

  /**
   * The number of separate buckets for weights (discretization). The more buckets,
   * the more fine-grained term weights (priorities) can be assigned. The speed of lookup

@@ -55,6 +54,6 @@ public class FSTLookupFactory extends LookupFactory {
        ? Boolean.valueOf(params.get(EXACT_MATCH_FIRST).toString())
        : true;

    return new FSTLookup(buckets, exactMatchFirst);
    return new FSTCompletionLookup(buckets, exactMatchFirst);
  }
}