mirror of https://github.com/apache/lucene.git
LUCENE-3807: Cleanup Suggest / Lookup API
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1296268 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2c94c522fd
commit
f303bcd465
|
@ -116,6 +116,13 @@ Changes in backwards compatibility policy
|
|||
and MultiPassIndexSplitter were made private as they now work
|
||||
per segment. (Uwe Schindler)
|
||||
|
||||
* LUCENE-3807: Cleaned up Suggest / Lookup API. Term weights (freqs) are now
|
||||
64bit signed integers instead of 32bit floats. Sorting of terms is now a
|
||||
disk based merge sort instead of an in-memory sort. The Lookup API now
|
||||
accepts and returns CharSequence instead of String which should be converted
|
||||
into a String before being used in a data structure that relies on hashCode / equals.
|
||||
(Simon Willnauer)
|
||||
|
||||
Changes in Runtime Behavior
|
||||
|
||||
* LUCENE-3698: FastVectorHighlighter no longer adds a multi value separator
|
||||
|
|
|
@ -25,6 +25,7 @@ import org.apache.lucene.util.BytesRef;
|
|||
|
||||
/**
|
||||
* This wrapper buffers incoming elements.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class BufferingTermFreqIteratorWrapper implements TermFreqIterator {
|
||||
// TODO keep this for now
|
||||
|
|
|
@ -18,81 +18,113 @@ package org.apache.lucene.search.suggest;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.ByteBlockPool;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.BytesRefIterator;
|
||||
import org.apache.lucene.util.Counter;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
import org.apache.lucene.util.SorterTemplate;
|
||||
|
||||
final class BytesRefList {
|
||||
|
||||
/**
|
||||
* A simple append-only random-access {@link BytesRef} array that stores full
|
||||
* copies of the appended bytes in a {@link ByteBlockPool}.
|
||||
*
|
||||
*
|
||||
* <b>Note: This class is not Thread-Safe!</b>
|
||||
*
|
||||
* @lucene.internal
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public final class BytesRefList {
|
||||
// TODO rename to BytesRefArray
|
||||
private final ByteBlockPool pool;
|
||||
private int[] offsets = new int[1];
|
||||
private int currentElement = 0;
|
||||
private int lastElement = 0;
|
||||
private int currentOffset = 0;
|
||||
private final Counter bytesUsed = Counter.newCounter(false);
|
||||
|
||||
/**
|
||||
* Creates a new {@link BytesRefList}
|
||||
*/
|
||||
public BytesRefList() {
|
||||
this(new ByteBlockPool(new ByteBlockPool.DirectAllocator()));
|
||||
}
|
||||
|
||||
public BytesRefList(ByteBlockPool pool) {
|
||||
this.pool = pool;
|
||||
this.pool = new ByteBlockPool(new ByteBlockPool.DirectTrackingAllocator(
|
||||
bytesUsed));
|
||||
pool.nextBuffer();
|
||||
bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_ARRAY_HEADER
|
||||
+ RamUsageEstimator.NUM_BYTES_INT);
|
||||
}
|
||||
|
||||
/**
|
||||
* Clears this {@link BytesRefList}
|
||||
*/
|
||||
public void clear() {
|
||||
lastElement = 0;
|
||||
currentOffset = 0;
|
||||
Arrays.fill(offsets, 0);
|
||||
pool.reset();
|
||||
}
|
||||
|
||||
/**
|
||||
* Appends a copy of the given {@link BytesRef} to this {@link BytesRefList}.
|
||||
* @param bytes the bytes to append
|
||||
* @return the ordinal of the appended bytes
|
||||
*/
|
||||
public int append(BytesRef bytes) {
|
||||
if (currentElement >= offsets.length) {
|
||||
if (lastElement >= offsets.length) {
|
||||
int oldLen = offsets.length;
|
||||
offsets = ArrayUtil.grow(offsets, offsets.length + 1);
|
||||
bytesUsed.addAndGet((offsets.length - oldLen)
|
||||
* RamUsageEstimator.NUM_BYTES_INT);
|
||||
}
|
||||
pool.copy(bytes);
|
||||
offsets[currentElement++] = currentOffset;
|
||||
offsets[lastElement++] = currentOffset;
|
||||
currentOffset += bytes.length;
|
||||
return currentElement;
|
||||
return lastElement;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the current size of this {@link BytesRefList}
|
||||
* @return the current size of this {@link BytesRefList}
|
||||
*/
|
||||
public int size() {
|
||||
return currentElement;
|
||||
return lastElement;
|
||||
}
|
||||
|
||||
public BytesRef get(BytesRef bytes, int pos) {
|
||||
if (currentElement > pos) {
|
||||
bytes.offset = offsets[pos];
|
||||
bytes.length = pos == currentElement - 1 ? currentOffset - bytes.offset
|
||||
: offsets[pos + 1] - bytes.offset;
|
||||
pool.copyFrom(bytes);
|
||||
return bytes;
|
||||
/**
|
||||
* Returns the <i>n'th</i> element of this {@link BytesRefList}
|
||||
* @param spare a spare {@link BytesRef} instance
|
||||
* @param ord the ordinal of the element to retrieve
|
||||
* @return the <i>n'th</i> element of this {@link BytesRefList}
|
||||
*/
|
||||
public BytesRef get(BytesRef spare, int ord) {
|
||||
if (lastElement > ord) {
|
||||
spare.offset = offsets[ord];
|
||||
spare.length = ord == lastElement - 1 ? currentOffset - spare.offset
|
||||
: offsets[ord + 1] - spare.offset;
|
||||
pool.copyFrom(spare);
|
||||
return spare;
|
||||
}
|
||||
throw new IndexOutOfBoundsException("index " + pos
|
||||
+ " must be less than the size: " + currentElement);
|
||||
throw new IndexOutOfBoundsException("index " + ord
|
||||
+ " must be less than the size: " + lastElement);
|
||||
|
||||
}
|
||||
|
||||
public BytesRefIterator iterator() {
|
||||
final int numElements = currentElement;
|
||||
|
||||
return new BytesRefIterator() {
|
||||
private final BytesRef spare = new BytesRef();
|
||||
private int pos = 0;
|
||||
|
||||
@Override
|
||||
public BytesRef next() throws IOException {
|
||||
if (pos < numElements) {
|
||||
get(spare, pos++);
|
||||
return spare;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Comparator<BytesRef> getComparator() {
|
||||
return null;
|
||||
}
|
||||
};
|
||||
/**
|
||||
* Returns the number of internally used bytes to hold the appended bytes in
|
||||
* memory
|
||||
*
|
||||
* @return the number of internally used bytes to hold the appended bytes in
|
||||
* memory
|
||||
*/
|
||||
public long bytesUsed() {
|
||||
return bytesUsed.get();
|
||||
}
|
||||
|
||||
public int[] sort(final Comparator<BytesRef> comp) {
|
||||
private int[] sort(final Comparator<BytesRef> comp) {
|
||||
final int[] orderdEntries = new int[size()];
|
||||
for (int i = 0; i < orderdEntries.length; i++) {
|
||||
orderdEntries[i] = i;
|
||||
|
@ -123,9 +155,52 @@ final class BytesRefList {
|
|||
return comp.compare(pivot, get(scratch2, ord));
|
||||
}
|
||||
|
||||
private final BytesRef pivot = new BytesRef(),
|
||||
scratch1 = new BytesRef(), scratch2 = new BytesRef();
|
||||
private final BytesRef pivot = new BytesRef(), scratch1 = new BytesRef(),
|
||||
scratch2 = new BytesRef();
|
||||
}.quickSort(0, size() - 1);
|
||||
return orderdEntries;
|
||||
}
|
||||
|
||||
/**
|
||||
* sugar for {@link #iterator(Comparator)} with a <code>null</code> comparator
|
||||
*/
|
||||
public BytesRefIterator iterator() {
|
||||
return iterator(null);
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* Returns a {@link BytesRefIterator} with point in time semantics. The
|
||||
* iterator provides access to all {@link BytesRef} instances appended so far.
|
||||
* </p>
|
||||
* <p>
|
||||
* If a non <code>null</code> {@link Comparator} is provided the iterator will
|
||||
* iterate the byte values in the order specified by the comparator. Otherwise
|
||||
* the order is the same as the order in which the values were appended.
|
||||
* </p>
|
||||
* <p>
|
||||
* This is a non-destructive operation.
|
||||
* </p>
|
||||
*/
|
||||
public BytesRefIterator iterator(final Comparator<BytesRef> comp) {
|
||||
final BytesRef spare = new BytesRef();
|
||||
final int size = size();
|
||||
final int[] ords = comp == null ? null : sort(comp);
|
||||
return new BytesRefIterator() {
|
||||
int pos = 0;
|
||||
|
||||
@Override
|
||||
public BytesRef next() throws IOException {
|
||||
if (pos < size) {
|
||||
return get(spare, ords == null ? pos++ : ords[pos++]);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Comparator<BytesRef> getComparator() {
|
||||
return comp;
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
|
|
@ -75,7 +75,11 @@ public class FileDictionary implements Dictionary {
|
|||
String[] fields = line.split("\t");
|
||||
if (fields.length > 1) {
|
||||
// keep reading floats for bw compat
|
||||
curFreq = (int)Float.parseFloat(fields[1]);
|
||||
try {
|
||||
curFreq = Long.parseLong(fields[1]);
|
||||
} catch (NumberFormatException e) {
|
||||
curFreq = (long)Double.parseDouble(fields[1]);
|
||||
}
|
||||
spare.copyChars(fields[0]);
|
||||
} else {
|
||||
spare.copyChars(line);
|
||||
|
|
|
@ -29,15 +29,19 @@ import org.apache.lucene.search.spell.TermFreqIterator;
|
|||
import org.apache.lucene.util.BytesRefIterator;
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
|
||||
/**
|
||||
* Simple Lookup interface for {@link CharSequence} suggestions.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public abstract class Lookup {
|
||||
/**
|
||||
* Result of a lookup.
|
||||
*/
|
||||
public static final class LookupResult implements Comparable<LookupResult> {
|
||||
public final CharSequence key;
|
||||
public final float value;
|
||||
public final long value;
|
||||
|
||||
public LookupResult(CharSequence key, float value) {
|
||||
public LookupResult(CharSequence key, long value) {
|
||||
this.key = key;
|
||||
this.value = value;
|
||||
}
|
||||
|
@ -112,6 +116,10 @@ public abstract class Lookup {
|
|||
build(tfit);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds up a new internal {@link Lookup} representation based on the given {@link TermFreqIterator}.
|
||||
* The implementation might re-sort the data internally.
|
||||
*/
|
||||
public abstract void build(TermFreqIterator tfit) throws IOException;
|
||||
|
||||
/**
|
||||
|
@ -124,21 +132,6 @@ public abstract class Lookup {
|
|||
*/
|
||||
public abstract List<LookupResult> lookup(CharSequence key, boolean onlyMorePopular, int num);
|
||||
|
||||
/**
|
||||
* Modify the lookup data by recording additional data. Optional operation.
|
||||
* @param key new lookup key
|
||||
* @param value value to associate with this key
|
||||
* @return true if new key is added, false if it already exists or operation
|
||||
* is not supported.
|
||||
*/
|
||||
public abstract boolean add(CharSequence key, Object value);
|
||||
|
||||
/**
|
||||
* Get value associated with a specific key.
|
||||
* @param key lookup key
|
||||
* @return associated value
|
||||
*/
|
||||
public abstract Object get(CharSequence key);
|
||||
|
||||
/**
|
||||
* Persist the constructed lookup data to a directory. Optional operation.
|
||||
|
@ -173,4 +166,5 @@ public abstract class Lookup {
|
|||
* @throws IOException when fatal IO error occurs.
|
||||
*/
|
||||
public abstract boolean load(File storeDir) throws IOException;
|
||||
|
||||
}
|
||||
|
|
|
@ -17,45 +17,166 @@ package org.apache.lucene.search.suggest;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.Comparator;
|
||||
|
||||
import org.apache.lucene.search.spell.TermFreqIterator;
|
||||
import org.apache.lucene.search.suggest.fst.Sort;
|
||||
import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesReader;
|
||||
import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesWriter;
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.store.ByteArrayDataOutput;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
/**
|
||||
* This wrapper buffers incoming elements and makes sure they are sorted in
|
||||
* ascending lexicographic order.
|
||||
* This wrapper buffers incoming elements and makes sure they are sorted based on given comparator.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class SortedTermFreqIteratorWrapper extends BufferingTermFreqIteratorWrapper {
|
||||
// TODO keep this for now - but the consumer should really sort this stuff on disk with sorter...
|
||||
private final int[] sortedOrds;
|
||||
private int currentOrd = -1;
|
||||
private final BytesRef spare = new BytesRef();
|
||||
private final Comparator<BytesRef> comp;
|
||||
public class SortedTermFreqIteratorWrapper implements TermFreqIterator {
|
||||
|
||||
public SortedTermFreqIteratorWrapper(TermFreqIterator source, Comparator<BytesRef> comp) throws IOException {
|
||||
super(source);
|
||||
this.sortedOrds = entries.sort(comp);
|
||||
this.comp = comp;
|
||||
private final TermFreqIterator source;
|
||||
private File tempInput;
|
||||
private File tempSorted;
|
||||
private final ByteSequencesReader reader;
|
||||
private boolean done = false;
|
||||
|
||||
private long weight;
|
||||
private final BytesRef scratch = new BytesRef();
|
||||
private final Comparator<BytesRef> comparator;
|
||||
|
||||
public SortedTermFreqIteratorWrapper(TermFreqIterator source, Comparator<BytesRef> comparator) throws IOException {
|
||||
this(source, comparator, false);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long weight() {
|
||||
return freqs[currentOrd];
|
||||
public SortedTermFreqIteratorWrapper(TermFreqIterator source, Comparator<BytesRef> comparator, boolean compareRawBytes) throws IOException {
|
||||
this.source = source;
|
||||
this.comparator = comparator;
|
||||
this.reader = sort(compareRawBytes ? comparator : new BytesOnlyComparator(this.comparator));
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef next() throws IOException {
|
||||
if (++curPos < entries.size()) {
|
||||
return entries.get(spare, (currentOrd = sortedOrds[curPos]));
|
||||
boolean success = false;
|
||||
if (done) {
|
||||
return null;
|
||||
}
|
||||
try {
|
||||
ByteArrayDataInput input = new ByteArrayDataInput();
|
||||
if (reader.read(scratch)) {
|
||||
weight = decode(scratch, input);
|
||||
success = true;
|
||||
return scratch;
|
||||
}
|
||||
close();
|
||||
success = done = true;
|
||||
return null;
|
||||
} finally {
|
||||
if (!success) {
|
||||
done = true;
|
||||
close();
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Comparator<BytesRef> getComparator() {
|
||||
return comp;
|
||||
return comparator;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long weight() {
|
||||
return weight;
|
||||
}
|
||||
|
||||
/**
 * Drains {@code source} into a temporary file, sorts that file with {@link Sort}
 * using the given comparator, and returns a reader over the sorted file.
 * Every entry is written through {@code encode(...)} together with the value of
 * {@code source.weight()} for that entry.
 * <p>
 * If anything fails, the writer is closed quietly and {@code close()} is invoked,
 * which deletes both temporary files. On success the temporary files are left in
 * place; they are only removed by a later call to {@code close()}.
 *
 * @param comparator the order handed to {@link Sort} for the encoded entries
 * @return a reader over {@code tempSorted}
 * @throws IOException if creating, writing, sorting or opening the temporary files fails
 */
private Sort.ByteSequencesReader sort(Comparator<BytesRef> comparator) throws IOException {
|
||||
String prefix = getClass().getSimpleName();
|
||||
File directory = Sort.defaultTempDir();
|
||||
// Both files are stored in fields so that close() can delete them later.
tempInput = File.createTempFile(prefix, ".input", directory);
|
||||
tempSorted = File.createTempFile(prefix, ".sorted", directory);
|
||||
|
||||
final Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput);
|
||||
boolean success = false;
|
||||
try {
|
||||
BytesRef spare;
|
||||
// NOTE(review): encode() assigns a grown array only to its own 'buffer'
// parameter, so this local stays zero-length across iterations and encode()
// has to grow on every call - confirm this is intended.
byte[] buffer = new byte[0];
|
||||
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
|
||||
|
||||
while ((spare = source.next()) != null) {
|
||||
encode(writer, output, buffer, spare, source.weight());
|
||||
}
|
||||
// Close the writer before tempInput is handed to Sort.
writer.close();
|
||||
new Sort(comparator).sort(tempInput, tempSorted);
|
||||
// Local variable; it shadows the 'reader' field of the enclosing class.
ByteSequencesReader reader = new Sort.ByteSequencesReader(tempSorted);
|
||||
success = true;
|
||||
return reader;
|
||||
|
||||
} finally {
|
||||
if (success) {
|
||||
// writer.close() was already called above; this closes the writer a second time.
IOUtils.close(writer);
|
||||
} else {
|
||||
try {
|
||||
IOUtils.closeWhileHandlingException(writer);
|
||||
} finally {
|
||||
// Failure path: close() deletes both temporary files.
close();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Deletes the temporary input and sorted files, if they were created, and then
 * closes {@code reader} via {@code IOUtils.close}.
 * The boolean results of the {@code File.delete()} calls are ignored.
 *
 * @throws IOException if closing the reader fails
 */
private void close() throws IOException {
|
||||
if (tempInput != null) {
|
||||
tempInput.delete();
|
||||
}
|
||||
if (tempSorted != null) {
|
||||
tempSorted.delete();
|
||||
}
|
||||
IOUtils.close(reader);
|
||||
}
|
||||
|
||||
/**
 * Comparator adapter that hides the last 8 bytes of each {@link BytesRef} from
 * the wrapped comparator. The entries written by {@code encode(...)} in the
 * enclosing class end with the weight written by {@code writeLong}, so the
 * wrapped comparator only sees the bytes that precede it.
 * <p>
 * The two scratch instances are reused on every call, so one comparator instance
 * mutates shared state in {@code compare}.
 */
private final static class BytesOnlyComparator implements Comparator<BytesRef> {
|
||||
|
||||
// Comparator that receives the truncated views.
final Comparator<BytesRef> other;
|
||||
private final BytesRef leftScratch = new BytesRef();
|
||||
private final BytesRef rightScratch = new BytesRef();
|
||||
|
||||
public BytesOnlyComparator(Comparator<BytesRef> other) {
|
||||
this.other = other;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compare(BytesRef left, BytesRef right) {
|
||||
wrap(leftScratch, left);
|
||||
wrap(rightScratch, right);
|
||||
return other.compare(leftScratch, rightScratch);
|
||||
}
|
||||
|
||||
// Points 'wrapper' at the same backing array as 'source' (no copy) and
// shortens the visible length by 8 bytes.
private void wrap(BytesRef wrapper, BytesRef source) {
|
||||
wrapper.bytes = source.bytes;
|
||||
wrapper.offset = source.offset;
|
||||
wrapper.length = source.length - 8;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Writes one entry to {@code writer}: the bytes of {@code spare} followed by
 * {@code weight} written with {@code writeLong}.
 *
 * @param writer destination for the encoded entry
 * @param output reusable output that is reset onto the buffer for every entry
 * @param buffer scratch array; grown when it cannot hold the entry
 * @param spare  the term bytes to encode
 * @param weight the weight appended after the term bytes
 * @throws IOException if writing to {@code writer} fails
 */
protected void encode(ByteSequencesWriter writer, ByteArrayDataOutput output, byte[] buffer, BytesRef spare, long weight) throws IOException {
|
||||
if (spare.length + 8 >= buffer.length) {
|
||||
// NOTE(review): this assigns the grown array to the parameter only; the
// caller's array reference is not updated, so a caller that keeps passing
// the same small array triggers a grow on every call.
buffer = ArrayUtil.grow(buffer, spare.length + 8);
|
||||
}
|
||||
output.reset(buffer);
|
||||
output.writeBytes(spare.bytes, spare.offset, spare.length);
|
||||
output.writeLong(weight);
|
||||
// Only the bytes written for this entry are emitted.
writer.write(buffer, 0, output.getPosition());
|
||||
}
|
||||
|
||||
/**
 * Reads the weight stored in the last 8 bytes of {@code scratch} and shortens
 * {@code scratch} by those 8 bytes, so that it afterwards covers only the bytes
 * in front of the weight.
 *
 * @param scratch  the encoded entry; its {@code length} is reduced by 8 as a side effect
 * @param tmpInput reusable input that is reset onto {@code scratch.bytes}
 * @return the value read with {@code readLong}
 */
protected long decode(BytesRef scratch, ByteArrayDataInput tmpInput) {
|
||||
// NOTE(review): scratch.offset is not taken into account here - assumes the
// entry starts at index 0 of scratch.bytes; TODO confirm.
tmpInput.reset(scratch.bytes);
|
||||
tmpInput.skipBytes(scratch.length - 8); // skip the suggestion bytes; the trailing 8 bytes hold the weight
|
||||
scratch.length -= 8; // drop the trailing weight from the visible length
|
||||
return tmpInput.readLong();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -26,6 +26,7 @@ import org.apache.lucene.util.BytesRef;
|
|||
/**
|
||||
* This wrapper buffers the incoming elements and makes sure they are in
|
||||
* random order.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class UnsortedTermFreqIteratorWrapper extends BufferingTermFreqIteratorWrapper {
|
||||
// TODO keep this for now
|
||||
|
|
|
@ -18,13 +18,16 @@ package org.apache.lucene.search.suggest.fst;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
import java.util.Comparator;
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.BytesRefIterator;
|
||||
|
||||
/**
|
||||
* Collects {@link BytesRef} and then allows one to iterate over their sorted order. Implementations
|
||||
* of this interface will be called in a single-threaded scenario.
|
||||
* @lucene.experimental
|
||||
* @lucene.internal
|
||||
*/
|
||||
public interface BytesRefSorter {
|
||||
/**
|
||||
|
@ -42,5 +45,7 @@ public interface BytesRefSorter {
|
|||
*
|
||||
* @throws IOException If an I/O exception occurs.
|
||||
*/
|
||||
Iterator<BytesRef> iterator() throws IOException;
|
||||
BytesRefIterator iterator() throws IOException;
|
||||
|
||||
Comparator<BytesRef> getComparator();
|
||||
}
|
||||
|
|
|
@ -18,14 +18,17 @@ package org.apache.lucene.search.suggest.fst;
|
|||
*/
|
||||
|
||||
import java.io.*;
|
||||
import java.util.Iterator;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Comparator;
|
||||
|
||||
import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesReader;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.BytesRefIterator;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
/**
|
||||
* Builds and iterates over sequences stored on disk.
|
||||
* @lucene.experimental
|
||||
* @lucene.internal
|
||||
*/
|
||||
public class ExternalRefSorter implements BytesRefSorter, Closeable {
|
||||
private final Sort sort;
|
||||
|
@ -38,30 +41,31 @@ public class ExternalRefSorter implements BytesRefSorter, Closeable {
|
|||
*/
|
||||
public ExternalRefSorter(Sort sort) throws IOException {
|
||||
this.sort = sort;
|
||||
this.input = File.createTempFile("RefSorter-", ".raw", Sort.defaultTempDir());
|
||||
this.input = File.createTempFile("RefSorter-", ".raw",
|
||||
Sort.defaultTempDir());
|
||||
this.writer = new Sort.ByteSequencesWriter(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void add(BytesRef utf8) throws IOException {
|
||||
if (writer == null)
|
||||
throw new IllegalStateException();
|
||||
if (writer == null) throw new IllegalStateException();
|
||||
writer.write(utf8);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<BytesRef> iterator() throws IOException {
|
||||
public BytesRefIterator iterator() throws IOException {
|
||||
if (sorted == null) {
|
||||
closeWriter();
|
||||
|
||||
sorted = File.createTempFile("RefSorter-", ".sorted", Sort.defaultTempDir());
|
||||
sorted = File.createTempFile("RefSorter-", ".sorted",
|
||||
Sort.defaultTempDir());
|
||||
sort.sort(input, sorted);
|
||||
|
||||
input.delete();
|
||||
input = null;
|
||||
}
|
||||
|
||||
return new ByteSequenceIterator(new Sort.ByteSequencesReader(sorted));
|
||||
return new ByteSequenceIterator(new Sort.ByteSequencesReader(sorted),
|
||||
sort.getComparator());
|
||||
}
|
||||
|
||||
private void closeWriter() throws IOException {
|
||||
|
@ -87,36 +91,50 @@ public class ExternalRefSorter implements BytesRefSorter, Closeable {
|
|||
/**
|
||||
* Iterate over byte refs in a file.
|
||||
*/
|
||||
class ByteSequenceIterator implements Iterator<BytesRef> {
|
||||
private ByteSequencesReader reader;
|
||||
private byte[] next;
|
||||
class ByteSequenceIterator implements BytesRefIterator {
|
||||
private final ByteSequencesReader reader;
|
||||
private BytesRef scratch = new BytesRef();
|
||||
private final Comparator<BytesRef> comparator;
|
||||
|
||||
public ByteSequenceIterator(ByteSequencesReader reader) throws IOException {
|
||||
public ByteSequenceIterator(ByteSequencesReader reader,
|
||||
Comparator<BytesRef> comparator) {
|
||||
this.reader = reader;
|
||||
this.next = reader.read();
|
||||
this.comparator = comparator;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return next != null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef next() {
|
||||
if (next == null) throw new NoSuchElementException();
|
||||
BytesRef r = new BytesRef(next);
|
||||
try {
|
||||
next = reader.read();
|
||||
if (next == null) {
|
||||
reader.close();
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
public BytesRef next() throws IOException {
|
||||
if (scratch == null) {
|
||||
return null;
|
||||
}
|
||||
boolean success = false;
|
||||
try {
|
||||
byte[] next = reader.read();
|
||||
if (next != null) {
|
||||
scratch.bytes = next;
|
||||
scratch.length = next.length;
|
||||
scratch.offset = 0;
|
||||
} else {
|
||||
IOUtils.close(reader);
|
||||
scratch = null;
|
||||
}
|
||||
success = true;
|
||||
return scratch;
|
||||
} finally {
|
||||
if (!success) {
|
||||
IOUtils.closeWhileHandlingException(reader);
|
||||
}
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() { throw new UnsupportedOperationException(); }
|
||||
public Comparator<BytesRef> getComparator() {
|
||||
return comparator;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Comparator<BytesRef> getComparator() {
|
||||
return sort.getComparator();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -28,6 +28,7 @@ import org.apache.lucene.util.fst.FST.Arc;
|
|||
* Finite state automata based implementation of "autocomplete" functionality.
|
||||
*
|
||||
* @see FSTCompletionBuilder
|
||||
* @lucene.experimental
|
||||
*/
|
||||
|
||||
// TODO: we could store exact weights as outputs from the FST (int4 encoded
|
||||
|
@ -159,10 +160,10 @@ public class FSTCompletion {
|
|||
* @param utf8
|
||||
* The sequence of utf8 bytes to follow.
|
||||
*
|
||||
* @return Returns the bucket number of the match or <code>null</code> if no
|
||||
* @return Returns the bucket number of the match or <code>-1</code> if no
|
||||
* match was found.
|
||||
*/
|
||||
private Integer getExactMatchStartingFromRootArc(
|
||||
private int getExactMatchStartingFromRootArc(
|
||||
int rootArcIndex, BytesRef utf8) {
|
||||
// Get the UTF-8 bytes representation of the input key.
|
||||
try {
|
||||
|
@ -186,7 +187,7 @@ public class FSTCompletion {
|
|||
}
|
||||
|
||||
// No match.
|
||||
return null;
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -273,8 +274,8 @@ public class FSTCompletion {
|
|||
// exact match, if requested.
|
||||
if (exactFirst) {
|
||||
if (!checkExistingAndReorder(res, key)) {
|
||||
Integer exactMatchBucket = getExactMatchStartingFromRootArc(i, key);
|
||||
if (exactMatchBucket != null) {
|
||||
int exactMatchBucket = getExactMatchStartingFromRootArc(i, key);
|
||||
if (exactMatchBucket != -1) {
|
||||
// Insert as the first result and truncate at num.
|
||||
while (res.size() >= num) {
|
||||
res.remove(res.size() - 1);
|
||||
|
@ -385,10 +386,10 @@ public class FSTCompletion {
|
|||
}
|
||||
|
||||
/**
|
||||
* Returns the bucket assigned to a given key (if found) or <code>null</code> if
|
||||
* Returns the bucket assigned to a given key (if found) or <code>-1</code> if
|
||||
* no exact match exists.
|
||||
*/
|
||||
public Integer getBucket(CharSequence key) {
|
||||
public int getBucket(CharSequence key) {
|
||||
return getExactMatchStartingFromRootArc(0, new BytesRef(key));
|
||||
}
|
||||
|
||||
|
|
|
@ -19,9 +19,9 @@ package org.apache.lucene.search.suggest.fst;
|
|||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.BytesRefIterator;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.fst.*;
|
||||
|
||||
|
@ -98,6 +98,7 @@ import org.apache.lucene.util.fst.*;
|
|||
* change, requiring you to rebuild the FST suggest index.
|
||||
*
|
||||
* @see FSTCompletion
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class FSTCompletionBuilder {
|
||||
/**
|
||||
|
@ -143,10 +144,11 @@ public class FSTCompletionBuilder {
|
|||
|
||||
/**
|
||||
* Creates an {@link FSTCompletion} with default options: 10 buckets, exact match
|
||||
* promoted to first position and {@link InMemorySorter}.
|
||||
* promoted to first position and {@link InMemorySorter} with a comparator obtained from
|
||||
* {@link BytesRef#getUTF8SortedAsUnicodeComparator()}.
|
||||
*/
|
||||
public FSTCompletionBuilder() {
|
||||
this(DEFAULT_BUCKETS, new InMemorySorter(), Integer.MAX_VALUE);
|
||||
this(DEFAULT_BUCKETS, new InMemorySorter(BytesRef.getUTF8SortedAsUnicodeComparator()), Integer.MAX_VALUE);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -237,10 +239,12 @@ public class FSTCompletionBuilder {
|
|||
shareMaxTailLength, outputs, null, false);
|
||||
|
||||
BytesRef scratch = new BytesRef();
|
||||
BytesRef entry;
|
||||
final IntsRef scratchIntsRef = new IntsRef();
|
||||
int count = 0;
|
||||
for (Iterator<BytesRef> i = sorter.iterator(); i.hasNext(); count++) {
|
||||
BytesRef entry = i.next();
|
||||
BytesRefIterator iter = sorter.iterator();
|
||||
while((entry = iter.next()) != null) {
|
||||
count++;
|
||||
if (scratch.compareTo(entry) != 0) {
|
||||
builder.add(Util.toIntsRef(entry, scratchIntsRef), empty);
|
||||
scratch.copyBytes(entry);
|
||||
|
|
|
@ -59,6 +59,7 @@ import org.apache.lucene.util.fst.NoOutputs;
|
|||
* use {@link FSTCompletion} directly or {@link TSTLookup}, for example.
|
||||
*
|
||||
* @see FSTCompletion
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class FSTCompletionLookup extends Lookup {
|
||||
/**
|
||||
|
@ -171,7 +172,7 @@ public class FSTCompletionLookup extends Lookup {
|
|||
}
|
||||
|
||||
output.reset(buffer);
|
||||
output.writeInt(FloatMagic.toSortable(tfit.weight()));
|
||||
output.writeInt(encodeWeight(tfit.weight()));
|
||||
output.writeBytes(spare.bytes, spare.offset, spare.length);
|
||||
writer.write(buffer, 0, output.getPosition());
|
||||
}
|
||||
|
@ -188,13 +189,13 @@ public class FSTCompletionLookup extends Lookup {
|
|||
reader = new Sort.ByteSequencesReader(tempSorted);
|
||||
long line = 0;
|
||||
int previousBucket = 0;
|
||||
float previousScore = 0;
|
||||
int previousScore = 0;
|
||||
ByteArrayDataInput input = new ByteArrayDataInput();
|
||||
BytesRef tmp1 = new BytesRef();
|
||||
BytesRef tmp2 = new BytesRef();
|
||||
while (reader.read(tmp1)) {
|
||||
input.reset(tmp1.bytes);
|
||||
float currentScore = FloatMagic.fromSortable(input.readInt());
|
||||
int currentScore = input.readInt();
|
||||
|
||||
int bucket;
|
||||
if (line > 0 && currentScore == previousScore) {
|
||||
|
@ -231,6 +232,14 @@ public class FSTCompletionLookup extends Lookup {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * weight -> cost: narrows a {@code long} weight to an {@code int}.
 *
 * @param value the weight to encode
 * @return {@code value} cast to {@code int}
 * @throws UnsupportedOperationException if {@code value} lies outside the
 *         range {@code Integer.MIN_VALUE}..{@code Integer.MAX_VALUE}
 */
|
||||
private static int encodeWeight(long value) {
|
||||
// Reject values that would not survive the cast below unchanged.
if (value < Integer.MIN_VALUE || value > Integer.MAX_VALUE) {
|
||||
throw new UnsupportedOperationException("cannot encode value: " + value);
|
||||
}
|
||||
return (int)value;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<LookupResult> lookup(CharSequence key, boolean higherWeightsFirst, int num) {
|
||||
final List<Completion> completions;
|
||||
|
@ -250,19 +259,9 @@ public class FSTCompletionLookup extends Lookup {
|
|||
return results;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean add(CharSequence key, Object value) {
|
||||
// Not supported.
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object get(CharSequence key) {
|
||||
Integer bucket = normalCompletion.getBucket(key);
|
||||
if (bucket == null)
|
||||
return null;
|
||||
else
|
||||
return (float) normalCompletion.getBucket(key) / normalCompletion.getBucketCount();
|
||||
final int bucket = normalCompletion.getBucket(key);
|
||||
return bucket == -1 ? null : Long.valueOf(bucket);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -1,75 +0,0 @@
|
|||
package org.apache.lucene.search.suggest.fst;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.NumericUtils;
|
||||
|
||||
/**
 * Converts raw float bit patterns (as produced by {@link Float#floatToRawIntBits(float)})
 * into 32-bit integers whose <em>unsigned</em> order matches the numeric order of the
 * original floats. The same order is obtained after promoting the result to a long with
 * the higher 32 bits zeroed.
 *
 * <p>This is a static utility holder and cannot be instantiated.
 */
final class FloatMagic {

  /** Static helpers only; no instances. */
  private FloatMagic() {}

  /**
   * Convert a float to a directly sortable unsigned integer. For sortable signed
   * integers, see {@link NumericUtils#floatToSortableInt(float)}.
   *
   * @param f the value to convert; every NaN maps to the same result
   * @return bits that order like {@code f} when compared as an unsigned int
   */
  public static int toSortable(float f) {
    return floatBitsToUnsignedOrdered(Float.floatToRawIntBits(f));
  }

  /**
   * Back from {@link #toSortable(float)} to float.
   *
   * @param v a value previously returned by {@link #toSortable(float)}
   * @return the corresponding float (the canonical quiet NaN for an encoded NaN)
   */
  public static float fromSortable(int v) {
    return Float.intBitsToFloat(unsignedOrderedToFloatBits(v));
  }

  /**
   * Convert float bits to directly sortable bits.
   * Normalizes all NaNs to canonical form.
   *
   * @param v raw IEEE 754 single-precision bits
   * @return bits whose unsigned order is: negatives, then positives, then NaN
   */
  static int floatBitsToUnsignedOrdered(int v) {
    // Canonicalize NaN ranges: with the sign bit masked off, anything above the
    // infinity pattern (0x7f800000) is a NaN. Different flavors of NaN are not
    // distinguished (see http://en.wikipedia.org/wiki/NaN).
    if ((v & 0x7fffffff) > 0x7f800000) {
      // Apply the positive-value logic below to a canonical "quiet NaN".
      return 0x7fc00000 ^ 0x80000000;
    }

    if (v < 0) {
      // Sign bit set: reverse the order of negative values and push them before
      // positive values (the complement clears the top bit).
      return ~v;
    } else {
      // Positive values are already ordered; setting the top bit shifts them after
      // the negatives but keeps them before the canonical NaN.
      return v ^ 0x80000000;
    }
  }

  /**
   * Back from {@link #floatBitsToUnsignedOrdered(int)}.
   *
   * @param v bits produced by {@link #floatBitsToUnsignedOrdered(int)}
   * @return the raw float bits they were derived from
   */
  static int unsignedOrderedToFloatBits(int v) {
    if (v < 0) {
      // Top bit set: this was a positive value (or NaN); clear the marker bit.
      return v & ~0x80000000;
    } else {
      // Top bit clear: this was a negative value stored as its complement.
      return ~v;
    }
  }
}
|
|
@ -17,29 +17,40 @@ package org.apache.lucene.search.suggest.fst;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.*;
|
||||
import java.util.Comparator;
|
||||
|
||||
import org.apache.lucene.search.suggest.BytesRefList;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.BytesRefIterator;
|
||||
|
||||
/**
|
||||
* A {@link BytesRefSorter} that keeps all the entries in memory.
|
||||
* @lucene.experimental
|
||||
* @lucene.internal
|
||||
*/
|
||||
public final class InMemorySorter implements BytesRefSorter {
|
||||
// TODO: use a single byte[] to back up all entries?
|
||||
private final ArrayList<BytesRef> refs = new ArrayList<BytesRef>();
|
||||
|
||||
private final BytesRefList buffer = new BytesRefList();
|
||||
private boolean closed = false;
|
||||
private final Comparator<BytesRef> comparator;
|
||||
|
||||
public InMemorySorter(Comparator<BytesRef> comparator) {
|
||||
this.comparator = comparator;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void add(BytesRef utf8) {
|
||||
if (closed) throw new IllegalStateException();
|
||||
refs.add(BytesRef.deepCopyOf(utf8));
|
||||
buffer.append(utf8);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<BytesRef> iterator() {
|
||||
public BytesRefIterator iterator() {
|
||||
closed = true;
|
||||
Collections.sort(refs, BytesRef.getUTF8SortedAsUnicodeComparator());
|
||||
return Collections.unmodifiableCollection(refs).iterator();
|
||||
return buffer.iterator(comparator);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Comparator<BytesRef> getComparator() {
|
||||
return comparator;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,15 +20,10 @@ package org.apache.lucene.search.suggest.fst;
|
|||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
import org.apache.lucene.search.suggest.BytesRefList;
|
||||
import org.apache.lucene.util.*;
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
|
||||
// TODO: the buffer is currently byte[][] which with very small arrays will terribly overallocate
|
||||
// memory (alignments) and make GC very happy.
|
||||
//
|
||||
// We could move it to a single byte[] + and use custom sorting, but we'd need to check if this
|
||||
// yields any improvement first.
|
||||
|
||||
/**
|
||||
* On-disk sorting of byte arrays. Each byte array (entry) is composed of the following
|
||||
* fields:
|
||||
|
@ -38,6 +33,8 @@ import org.apache.lucene.util.PriorityQueue;
|
|||
* </ul>
|
||||
*
|
||||
* @see #sort(File, File)
|
||||
* @lucene.experimental
|
||||
* @lucene.internal
|
||||
*/
|
||||
public final class Sort {
|
||||
public final static int MB = 1024 * 1024;
|
||||
|
@ -59,11 +56,6 @@ public final class Sort {
|
|||
*/
|
||||
public final static int MAX_TEMPFILES = 128;
|
||||
|
||||
/**
|
||||
* Minimum slot buffer expansion.
|
||||
*/
|
||||
private final static int MIN_EXPECTED_GROWTH = 1000;
|
||||
|
||||
/**
|
||||
* A bit more descriptive unit for constructors.
|
||||
*
|
||||
|
@ -111,21 +103,6 @@ public final class Sort {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* byte[] in unsigned byte order.
|
||||
*/
|
||||
static final Comparator<byte[]> unsignedByteOrderComparator = new Comparator<byte[]>() {
|
||||
public int compare(byte[] left, byte[] right) {
|
||||
final int max = Math.min(left.length, right.length);
|
||||
for (int i = 0, j = 0; i < max; i++, j++) {
|
||||
int diff = (left[i] & 0xff) - (right[j] & 0xff);
|
||||
if (diff != 0)
|
||||
return diff;
|
||||
}
|
||||
return left.length - right.length;
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Sort info (debugging mostly).
|
||||
*/
|
||||
|
@ -149,14 +126,15 @@ public final class Sort {
|
|||
}
|
||||
}
|
||||
|
||||
private final static byte [][] EMPTY = new byte [0][];
|
||||
|
||||
private final BufferSize ramBufferSize;
|
||||
private final File tempDirectory;
|
||||
|
||||
private byte [][] buffer = new byte [0][];
|
||||
private final BytesRefList buffer = new BytesRefList();
|
||||
private SortInfo sortInfo;
|
||||
private int maxTempFiles;
|
||||
private final Comparator<BytesRef> comparator;
|
||||
|
||||
public static final Comparator<BytesRef> DEFAULT_COMPARATOR = BytesRef.getUTF8SortedAsUnicodeComparator();
|
||||
|
||||
/**
|
||||
* Default constructor.
|
||||
|
@ -165,13 +143,17 @@ public final class Sort {
|
|||
* @see BufferSize#automatic()
|
||||
*/
|
||||
public Sort() throws IOException {
|
||||
this(BufferSize.automatic(), defaultTempDir(), MAX_TEMPFILES);
|
||||
this(DEFAULT_COMPARATOR, BufferSize.automatic(), defaultTempDir(), MAX_TEMPFILES);
|
||||
}
|
||||
|
||||
public Sort(Comparator<BytesRef> comparator) throws IOException {
|
||||
this(comparator, BufferSize.automatic(), defaultTempDir(), MAX_TEMPFILES);
|
||||
}
|
||||
|
||||
/**
|
||||
* All-details constructor.
|
||||
*/
|
||||
public Sort(BufferSize ramBufferSize, File tempDirectory, int maxTempfiles) {
|
||||
public Sort(Comparator<BytesRef> comparator, BufferSize ramBufferSize, File tempDirectory, int maxTempfiles) {
|
||||
if (ramBufferSize.bytes < ABSOLUTE_MIN_SORT_BUFFER_SIZE) {
|
||||
throw new IllegalArgumentException(MIN_BUFFER_SIZE_MSG + ": " + ramBufferSize.bytes);
|
||||
}
|
||||
|
@ -183,6 +165,7 @@ public final class Sort {
|
|||
this.ramBufferSize = ramBufferSize;
|
||||
this.tempDirectory = tempDirectory;
|
||||
this.maxTempFiles = maxTempfiles;
|
||||
this.comparator = comparator;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -283,23 +266,25 @@ public final class Sort {
|
|||
|
||||
/** Sort a single partition in-memory. */
|
||||
protected File sortPartition(int len) throws IOException {
|
||||
byte [][] data = this.buffer;
|
||||
BytesRefList data = this.buffer;
|
||||
File tempFile = File.createTempFile("sort", "partition", tempDirectory);
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
Arrays.sort(data, 0, len, unsignedByteOrderComparator);
|
||||
sortInfo.sortTime += (System.currentTimeMillis() - start);
|
||||
|
||||
ByteSequencesWriter out = new ByteSequencesWriter(tempFile);
|
||||
final ByteSequencesWriter out = new ByteSequencesWriter(tempFile);
|
||||
BytesRef spare;
|
||||
try {
|
||||
for (int i = 0; i < len; i++) {
|
||||
assert data[i].length <= Short.MAX_VALUE;
|
||||
out.write(data[i]);
|
||||
BytesRefIterator iter = buffer.iterator(comparator);
|
||||
while((spare = iter.next()) != null) {
|
||||
assert spare.length <= Short.MAX_VALUE;
|
||||
out.write(spare);
|
||||
}
|
||||
|
||||
out.close();
|
||||
|
||||
// Clean up the buffer for the next partition.
|
||||
this.buffer = EMPTY;
|
||||
data.clear();
|
||||
return tempFile;
|
||||
} finally {
|
||||
IOUtils.close(out);
|
||||
|
@ -314,7 +299,7 @@ public final class Sort {
|
|||
|
||||
PriorityQueue<FileAndTop> queue = new PriorityQueue<FileAndTop>(merges.size()) {
|
||||
protected boolean lessThan(FileAndTop a, FileAndTop b) {
|
||||
return a.current.compareTo(b.current) < 0;
|
||||
return comparator.compare(a.current, b.current) < 0;
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -359,33 +344,18 @@ public final class Sort {
|
|||
/** Read in a single partition of data */
|
||||
int readPartition(ByteSequencesReader reader) throws IOException {
|
||||
long start = System.currentTimeMillis();
|
||||
|
||||
// We will be reallocating from scratch.
|
||||
Arrays.fill(this.buffer, null);
|
||||
|
||||
int bytesLimit = this.ramBufferSize.bytes;
|
||||
byte [][] data = this.buffer;
|
||||
byte[] line;
|
||||
int linesRead = 0;
|
||||
while ((line = reader.read()) != null) {
|
||||
if (linesRead + 1 >= data.length) {
|
||||
data = Arrays.copyOf(data,
|
||||
ArrayUtil.oversize(linesRead + MIN_EXPECTED_GROWTH,
|
||||
RamUsageEstimator.NUM_BYTES_OBJECT_REF));
|
||||
}
|
||||
data[linesRead++] = line;
|
||||
|
||||
final BytesRef scratch = new BytesRef();
|
||||
while ((scratch.bytes = reader.read()) != null) {
|
||||
scratch.length = scratch.bytes.length;
|
||||
buffer.append(scratch);
|
||||
// Account for the created objects.
|
||||
// (buffer slots do not account to buffer size.)
|
||||
bytesLimit -= line.length + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER;
|
||||
if (bytesLimit < 0) {
|
||||
if (ramBufferSize.bytes < buffer.bytesUsed()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
this.buffer = data;
|
||||
|
||||
sortInfo.readTime += (System.currentTimeMillis() - start);
|
||||
return linesRead;
|
||||
return buffer.size();
|
||||
}
|
||||
|
||||
static class FileAndTop {
|
||||
|
@ -516,4 +486,8 @@ public final class Sort {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
public Comparator<BytesRef> getComparator() {
|
||||
return comparator;
|
||||
}
|
||||
}
|
|
@ -28,6 +28,8 @@ import java.util.List;
|
|||
|
||||
import org.apache.lucene.search.spell.TermFreqIterator;
|
||||
import org.apache.lucene.search.suggest.Lookup;
|
||||
import org.apache.lucene.search.suggest.SortedTermFreqIteratorWrapper;
|
||||
import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesWriter;
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.store.ByteArrayDataOutput;
|
||||
import org.apache.lucene.store.InputStreamDataInput;
|
||||
|
@ -102,72 +104,27 @@ public class WFSTCompletionLookup extends Lookup {
|
|||
|
||||
@Override
|
||||
public void build(TermFreqIterator iterator) throws IOException {
|
||||
String prefix = getClass().getSimpleName();
|
||||
File directory = Sort.defaultTempDir();
|
||||
File tempInput = File.createTempFile(prefix, ".input", directory);
|
||||
File tempSorted = File.createTempFile(prefix, ".sorted", directory);
|
||||
|
||||
Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput);
|
||||
Sort.ByteSequencesReader reader = null;
|
||||
BytesRef scratch = new BytesRef();
|
||||
TermFreqIterator iter = new WFSTTermFreqIteratorWrapper(iterator,
|
||||
BytesRef.getUTF8SortedAsUnicodeComparator());
|
||||
IntsRef scratchInts = new IntsRef();
|
||||
BytesRef previous = null;
|
||||
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
|
||||
Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||
while ((scratch = iter.next()) != null) {
|
||||
long cost = iter.weight();
|
||||
|
||||
boolean success = false;
|
||||
try {
|
||||
byte [] buffer = new byte [0];
|
||||
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
|
||||
BytesRef spare;
|
||||
while ((spare = iterator.next()) != null) {
|
||||
if (spare.length + 5 >= buffer.length) {
|
||||
buffer = ArrayUtil.grow(buffer, spare.length + 5);
|
||||
}
|
||||
|
||||
output.reset(buffer);
|
||||
output.writeBytes(spare.bytes, spare.offset, spare.length);
|
||||
output.writeByte((byte)0); // separator: not used, just for sort order
|
||||
output.writeInt((int)encodeWeight(iterator.weight()));
|
||||
writer.write(buffer, 0, output.getPosition());
|
||||
if (previous == null) {
|
||||
previous = new BytesRef();
|
||||
} else if (scratch.equals(previous)) {
|
||||
continue; // for duplicate suggestions, the best weight is actually
|
||||
// added
|
||||
}
|
||||
writer.close();
|
||||
new Sort().sort(tempInput, tempSorted);
|
||||
reader = new Sort.ByteSequencesReader(tempSorted);
|
||||
|
||||
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
|
||||
Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||
|
||||
BytesRef previous = null;
|
||||
BytesRef suggestion = new BytesRef();
|
||||
IntsRef scratchInts = new IntsRef();
|
||||
ByteArrayDataInput input = new ByteArrayDataInput();
|
||||
while (reader.read(scratch)) {
|
||||
suggestion.bytes = scratch.bytes;
|
||||
suggestion.offset = scratch.offset;
|
||||
suggestion.length = scratch.length - 5; // int + separator
|
||||
|
||||
input.reset(scratch.bytes);
|
||||
input.skipBytes(suggestion.length + 1); // suggestion + separator
|
||||
long cost = input.readInt();
|
||||
|
||||
if (previous == null) {
|
||||
previous = new BytesRef();
|
||||
} else if (suggestion.equals(previous)) {
|
||||
continue; // for duplicate suggestions, the best weight is actually added
|
||||
}
|
||||
Util.toIntsRef(suggestion, scratchInts);
|
||||
builder.add(scratchInts, cost);
|
||||
previous.copyBytes(suggestion);
|
||||
}
|
||||
fst = builder.finish();
|
||||
success = true;
|
||||
} finally {
|
||||
if (success) {
|
||||
IOUtils.close(reader, writer);
|
||||
} else {
|
||||
IOUtils.closeWhileHandlingException(reader, writer);
|
||||
}
|
||||
|
||||
tempInput.delete();
|
||||
tempSorted.delete();
|
||||
Util.toIntsRef(scratch, scratchInts);
|
||||
builder.add(scratchInts, cost);
|
||||
previous.copyBytes(scratch);
|
||||
}
|
||||
fst = builder.finish();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -270,16 +227,10 @@ public class WFSTCompletionLookup extends Lookup {
|
|||
return output;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean add(CharSequence key, Object value) {
|
||||
return false; // Not supported.
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the weight associated with an input string,
|
||||
* or null if it does not exist.
|
||||
*/
|
||||
@Override
|
||||
public Object get(CharSequence key) {
|
||||
Arc<Long> arc = new Arc<Long>();
|
||||
Long result = null;
|
||||
|
@ -289,23 +240,51 @@ public class WFSTCompletionLookup extends Lookup {
|
|||
if (result == null || !arc.isFinal()) {
|
||||
return null;
|
||||
} else {
|
||||
return decodeWeight(result + arc.nextFinalOutput);
|
||||
return Integer.valueOf(decodeWeight(result + arc.nextFinalOutput));
|
||||
}
|
||||
}
|
||||
|
||||
/** cost -> weight */
|
||||
private static float decodeWeight(long encoded) {
|
||||
return Integer.MAX_VALUE - encoded;
|
||||
private static int decodeWeight(long encoded) {
|
||||
return (int)(Integer.MAX_VALUE - encoded);
|
||||
}
|
||||
|
||||
/** weight -> cost */
|
||||
private static long encodeWeight(float value) {
|
||||
if (Float.isNaN(value) || Float.isInfinite(value) || value < 0 || value > Integer.MAX_VALUE) {
|
||||
private static int encodeWeight(long value) {
|
||||
if (value < 0 || value > Integer.MAX_VALUE) {
|
||||
throw new UnsupportedOperationException("cannot encode value: " + value);
|
||||
}
|
||||
return Integer.MAX_VALUE - (int)value;
|
||||
}
|
||||
|
||||
private final class WFSTTermFreqIteratorWrapper extends SortedTermFreqIteratorWrapper {
|
||||
|
||||
WFSTTermFreqIteratorWrapper(TermFreqIterator source,
|
||||
Comparator<BytesRef> comparator) throws IOException {
|
||||
super(source, comparator, true);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void encode(ByteSequencesWriter writer, ByteArrayDataOutput output, byte[] buffer, BytesRef spare, long weight) throws IOException {
|
||||
if (spare.length + 5 >= buffer.length) {
|
||||
buffer = ArrayUtil.grow(buffer, spare.length + 5);
|
||||
}
|
||||
output.reset(buffer);
|
||||
output.writeBytes(spare.bytes, spare.offset, spare.length);
|
||||
output.writeByte((byte)0); // separator: not used, just for sort order
|
||||
output.writeInt(encodeWeight(weight));
|
||||
writer.write(buffer, 0, output.getPosition());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected long decode(BytesRef scratch, ByteArrayDataInput tmpInput) {
|
||||
tmpInput.reset(scratch.bytes);
|
||||
tmpInput.skipBytes(scratch.length - 4); // suggestion + separator
|
||||
scratch.length -= 5; // sep + long
|
||||
return tmpInput.readInt();
|
||||
}
|
||||
}
|
||||
|
||||
static final Comparator<Long> weightComparator = new Comparator<Long> () {
|
||||
public int compare(Long left, Long right) {
|
||||
return left.compareTo(right);
|
||||
|
|
|
@ -55,24 +55,22 @@ public class JaspellLookup extends Lookup {
|
|||
final CharsRef charsSpare = new CharsRef();
|
||||
|
||||
while ((spare = tfit.next()) != null) {
|
||||
float freq = tfit.weight();
|
||||
final long weight = tfit.weight();
|
||||
if (spare.length == 0) {
|
||||
continue;
|
||||
}
|
||||
charsSpare.grow(spare.length);
|
||||
UnicodeUtil.UTF8toUTF16(spare.bytes, spare.offset, spare.length, charsSpare);
|
||||
trie.put(charsSpare.toString(), new Float(freq));
|
||||
trie.put(charsSpare.toString(), Long.valueOf(weight));
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean add(CharSequence key, Object value) {
|
||||
trie.put(key, value);
|
||||
// XXX
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object get(CharSequence key) {
|
||||
return trie.get(key);
|
||||
}
|
||||
|
@ -95,7 +93,7 @@ public class JaspellLookup extends Lookup {
|
|||
if (onlyMorePopular) {
|
||||
LookupPriorityQueue queue = new LookupPriorityQueue(num);
|
||||
for (String s : list) {
|
||||
float freq = (Float)trie.get(s);
|
||||
long freq = ((Number)trie.get(s)).longValue();
|
||||
queue.insertWithOverflow(new LookupResult(new CharsRef(s), freq));
|
||||
}
|
||||
for (LookupResult lr : queue.getResults()) {
|
||||
|
@ -104,7 +102,7 @@ public class JaspellLookup extends Lookup {
|
|||
} else {
|
||||
for (int i = 0; i < maxCnt; i++) {
|
||||
String s = list.get(i);
|
||||
float freq = (Float)trie.get(s);
|
||||
long freq = ((Number)trie.get(s)).longValue();
|
||||
res.add(new LookupResult(new CharsRef(s), freq));
|
||||
}
|
||||
}
|
||||
|
@ -131,7 +129,7 @@ public class JaspellLookup extends Lookup {
|
|||
node.splitchar = in.readChar();
|
||||
byte mask = in.readByte();
|
||||
if ((mask & HAS_VALUE) != 0) {
|
||||
node.data = new Float(in.readFloat());
|
||||
node.data = Long.valueOf(in.readLong());
|
||||
}
|
||||
if ((mask & LO_KID) != 0) {
|
||||
TSTNode kid = trie.new TSTNode('\0', node);
|
||||
|
@ -171,7 +169,7 @@ public class JaspellLookup extends Lookup {
|
|||
if (node.data != null) mask |= HAS_VALUE;
|
||||
out.writeByte(mask);
|
||||
if (node.data != null) {
|
||||
out.writeFloat((Float)node.data);
|
||||
out.writeLong(((Number)node.data).longValue());
|
||||
}
|
||||
writeRecursively(out, node.relatives[TSTNode.LOKID]);
|
||||
writeRecursively(out, node.relatives[TSTNode.EQKID]);
|
||||
|
|
|
@ -50,26 +50,24 @@ public class TSTLookup extends Lookup {
|
|||
}
|
||||
|
||||
ArrayList<String> tokens = new ArrayList<String>();
|
||||
ArrayList<Float> vals = new ArrayList<Float>();
|
||||
ArrayList<Number> vals = new ArrayList<Number>();
|
||||
BytesRef spare;
|
||||
CharsRef charsSpare = new CharsRef();
|
||||
while ((spare = tfit.next()) != null) {
|
||||
charsSpare.grow(spare.length);
|
||||
UnicodeUtil.UTF8toUTF16(spare.bytes, spare.offset, spare.length, charsSpare);
|
||||
tokens.add(charsSpare.toString());
|
||||
vals.add(new Float(tfit.weight()));
|
||||
vals.add(Long.valueOf(tfit.weight()));
|
||||
}
|
||||
autocomplete.balancedTree(tokens.toArray(), vals.toArray(), 0, tokens.size() - 1, root);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean add(CharSequence key, Object value) {
|
||||
autocomplete.insert(root, key, value, 0);
|
||||
// XXX we don't know if a new node was created
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object get(CharSequence key) {
|
||||
List<TernaryTreeNode> list = autocomplete.prefixCompletion(root, key, 0);
|
||||
if (list == null || list.isEmpty()) {
|
||||
|
@ -107,7 +105,7 @@ public class TSTLookup extends Lookup {
|
|||
if (onlyMorePopular) {
|
||||
LookupPriorityQueue queue = new LookupPriorityQueue(num);
|
||||
for (TernaryTreeNode ttn : list) {
|
||||
queue.insertWithOverflow(new LookupResult(ttn.token, (Float)ttn.val));
|
||||
queue.insertWithOverflow(new LookupResult(ttn.token, ((Number)ttn.val).longValue()));
|
||||
}
|
||||
for (LookupResult lr : queue.getResults()) {
|
||||
res.add(lr);
|
||||
|
@ -115,7 +113,7 @@ public class TSTLookup extends Lookup {
|
|||
} else {
|
||||
for (int i = 0; i < maxCnt; i++) {
|
||||
TernaryTreeNode ttn = list.get(i);
|
||||
res.add(new LookupResult(ttn.token, (Float)ttn.val));
|
||||
res.add(new LookupResult(ttn.token, ((Number)ttn.val).longValue()));
|
||||
}
|
||||
}
|
||||
return res;
|
||||
|
@ -146,7 +144,7 @@ public class TSTLookup extends Lookup {
|
|||
node.token = in.readUTF();
|
||||
}
|
||||
if ((mask & HAS_VALUE) != 0) {
|
||||
node.val = new Float(in.readFloat());
|
||||
node.val = Long.valueOf(in.readLong());
|
||||
}
|
||||
if ((mask & LO_KID) != 0) {
|
||||
node.loKid = new TernaryTreeNode();
|
||||
|
@ -184,7 +182,7 @@ public class TSTLookup extends Lookup {
|
|||
if (node.val != null) mask |= HAS_VALUE;
|
||||
out.writeByte(mask);
|
||||
if (node.token != null) out.writeUTF(node.token);
|
||||
if (node.val != null) out.writeFloat((Float)node.val);
|
||||
if (node.val != null) out.writeLong(((Number)node.val).longValue());
|
||||
// recurse and write kids
|
||||
if (node.loKid != null) {
|
||||
writeRecursively(out, node.loKid);
|
||||
|
|
|
@ -17,8 +17,10 @@
|
|||
package org.apache.lucene.search.suggest;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.search.suggest.Lookup;
|
||||
import org.apache.lucene.search.suggest.Lookup.LookupResult;
|
||||
import org.apache.lucene.search.suggest.fst.FSTCompletionLookup;
|
||||
import org.apache.lucene.search.suggest.jaspell.JaspellLookup;
|
||||
import org.apache.lucene.search.suggest.tst.TSTLookup;
|
||||
|
@ -74,16 +76,18 @@ public class PersistenceTest extends LuceneTestCase {
|
|||
lookup.load(storeDir);
|
||||
|
||||
// Assert validity.
|
||||
float previous = Float.NEGATIVE_INFINITY;
|
||||
long previous = Long.MIN_VALUE;
|
||||
for (TermFreq k : keys) {
|
||||
Float val = (Float) lookup.get(_TestUtil.bytesToCharSequence(k.term, random));
|
||||
assertNotNull(k.term.utf8ToString(), val);
|
||||
List<LookupResult> list = lookup.lookup(_TestUtil.bytesToCharSequence(k.term, random), false, 1);
|
||||
assertEquals(1, list.size());
|
||||
LookupResult lookupResult = list.get(0);
|
||||
assertNotNull(k.term.utf8ToString(), lookupResult.key);
|
||||
|
||||
if (supportsExactWeights) {
|
||||
assertEquals(k.term.utf8ToString(), Float.valueOf(k.v), val);
|
||||
assertEquals(k.term.utf8ToString(), k.v, lookupResult.value);
|
||||
} else {
|
||||
assertTrue(val + ">=" + previous, val >= previous);
|
||||
previous = val.floatValue();
|
||||
assertTrue(lookupResult.value + ">=" + previous, lookupResult.value >= previous);
|
||||
previous = lookupResult.value;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -33,55 +33,75 @@ public class TestBytesRefList extends LuceneTestCase {
|
|||
public void testAppend() throws IOException {
|
||||
BytesRefList list = new BytesRefList();
|
||||
List<String> stringList = new ArrayList<String>();
|
||||
int entries = atLeast(500);
|
||||
BytesRef spare = new BytesRef();
|
||||
for (int i = 0; i < entries; i++) {
|
||||
String randomRealisticUnicodeString = _TestUtil
|
||||
.randomRealisticUnicodeString(random);
|
||||
spare.copyChars(randomRealisticUnicodeString);
|
||||
list.append(spare);
|
||||
stringList.add(randomRealisticUnicodeString);
|
||||
}
|
||||
for (int i = 0; i < entries; i++) {
|
||||
assertNotNull(list.get(spare, i));
|
||||
assertEquals("entry " + i + " doesn't match", stringList.get(i),
|
||||
spare.utf8ToString());
|
||||
}
|
||||
for (int j = 0; j < 2; j++) {
|
||||
if (j > 0 && random.nextBoolean()) {
|
||||
list.clear();
|
||||
stringList.clear();
|
||||
}
|
||||
int entries = atLeast(500);
|
||||
BytesRef spare = new BytesRef();
|
||||
for (int i = 0; i < entries; i++) {
|
||||
String randomRealisticUnicodeString = _TestUtil
|
||||
.randomRealisticUnicodeString(random);
|
||||
spare.copyChars(randomRealisticUnicodeString);
|
||||
list.append(spare);
|
||||
stringList.add(randomRealisticUnicodeString);
|
||||
}
|
||||
for (int i = 0; i < entries; i++) {
|
||||
assertNotNull(list.get(spare, i));
|
||||
assertEquals("entry " + i + " doesn't match", stringList.get(i),
|
||||
spare.utf8ToString());
|
||||
}
|
||||
|
||||
// check random
|
||||
for (int i = 0; i < entries; i++) {
|
||||
int e = random.nextInt(entries);
|
||||
assertNotNull(list.get(spare, e));
|
||||
assertEquals("entry " + i + " doesn't match", stringList.get(e),
|
||||
spare.utf8ToString());
|
||||
}
|
||||
for (int i = 0; i < 2; i++) {
|
||||
// check random
|
||||
for (int i = 0; i < entries; i++) {
|
||||
int e = random.nextInt(entries);
|
||||
assertNotNull(list.get(spare, e));
|
||||
assertEquals("entry " + i + " doesn't match", stringList.get(e),
|
||||
spare.utf8ToString());
|
||||
}
|
||||
for (int i = 0; i < 2; i++) {
|
||||
|
||||
BytesRefIterator iterator = list.iterator();
|
||||
for (String string : stringList) {
|
||||
assertEquals(string, iterator.next().utf8ToString());
|
||||
BytesRefIterator iterator = list.iterator();
|
||||
for (String string : stringList) {
|
||||
assertEquals(string, iterator.next().utf8ToString());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void testSort() {
|
||||
public void testSort() throws IOException {
|
||||
BytesRefList list = new BytesRefList();
|
||||
List<String> stringList = new ArrayList<String>();
|
||||
int entries = atLeast(500);
|
||||
BytesRef spare = new BytesRef();
|
||||
for (int i = 0; i < entries; i++) {
|
||||
String randomRealisticUnicodeString = _TestUtil.randomRealisticUnicodeString(random);
|
||||
spare.copyChars(randomRealisticUnicodeString);
|
||||
list.append(spare);
|
||||
stringList.add(randomRealisticUnicodeString);
|
||||
}
|
||||
Collections.sort(stringList);
|
||||
int[] sortedOrds = list.sort(BytesRef.getUTF8SortedAsUTF16Comparator());
|
||||
for (int i = 0; i < entries; i++) {
|
||||
assertNotNull(list.get(spare, sortedOrds[i]));
|
||||
assertEquals("entry " + i + " doesn't match", stringList.get(i),
|
||||
spare.utf8ToString());
|
||||
|
||||
for (int j = 0; j < 2; j++) {
|
||||
if (j > 0 && random.nextBoolean()) {
|
||||
list.clear();
|
||||
stringList.clear();
|
||||
}
|
||||
int entries = atLeast(500);
|
||||
BytesRef spare = new BytesRef();
|
||||
for (int i = 0; i < entries; i++) {
|
||||
String randomRealisticUnicodeString = _TestUtil
|
||||
.randomRealisticUnicodeString(random);
|
||||
spare.copyChars(randomRealisticUnicodeString);
|
||||
list.append(spare);
|
||||
stringList.add(randomRealisticUnicodeString);
|
||||
}
|
||||
|
||||
Collections.sort(stringList);
|
||||
BytesRefIterator iter = list.iterator(BytesRef
|
||||
.getUTF8SortedAsUTF16Comparator());
|
||||
int i = 0;
|
||||
while ((spare = iter.next()) != null) {
|
||||
assertEquals("entry " + i + " doesn't match", stringList.get(i),
|
||||
spare.utf8ToString());
|
||||
i++;
|
||||
}
|
||||
assertNull(iter.next());
|
||||
assertEquals(i, stringList.size());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -17,12 +17,16 @@ package org.apache.lucene.search.suggest;
|
|||
* the License.
|
||||
*/
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
|
||||
import org.apache.lucene.search.spell.TermFreqIterator;
|
||||
import org.apache.lucene.store.ByteArrayDataOutput;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.BytesRefHash;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
|
||||
|
@ -38,7 +42,8 @@ public class TestTermFreqIterator extends LuceneTestCase {
|
|||
public void testTerms() throws Exception {
|
||||
int num = atLeast(10000);
|
||||
|
||||
TreeMap<BytesRef,Long> sorted = new TreeMap<BytesRef,Long>();
|
||||
Comparator<BytesRef> comparator = random.nextBoolean() ? BytesRef.getUTF8SortedAsUnicodeComparator() : BytesRef.getUTF8SortedAsUTF16Comparator();
|
||||
TreeMap<BytesRef,Long> sorted = new TreeMap<BytesRef,Long>(comparator);
|
||||
TermFreq[] unsorted = new TermFreq[num];
|
||||
|
||||
for (int i = 0; i < num; i++) {
|
||||
|
@ -52,13 +57,13 @@ public class TestTermFreqIterator extends LuceneTestCase {
|
|||
}
|
||||
|
||||
// test the sorted iterator wrapper
|
||||
TermFreqIterator wrapper = new SortedTermFreqIteratorWrapper(new TermFreqArrayIterator(unsorted), BytesRef.getUTF8SortedAsUnicodeComparator());
|
||||
TermFreqIterator wrapper = new SortedTermFreqIteratorWrapper(new TermFreqArrayIterator(unsorted), comparator);
|
||||
Iterator<Map.Entry<BytesRef,Long>> expected = sorted.entrySet().iterator();
|
||||
while (expected.hasNext()) {
|
||||
Map.Entry<BytesRef,Long> entry = expected.next();
|
||||
|
||||
assertEquals(entry.getKey(), wrapper.next());
|
||||
assertEquals(entry.getValue().longValue(), wrapper.weight(), 0F);
|
||||
assertEquals(entry.getValue().longValue(), wrapper.weight());
|
||||
}
|
||||
assertNull(wrapper.next());
|
||||
|
||||
|
@ -72,4 +77,57 @@ public class TestTermFreqIterator extends LuceneTestCase {
|
|||
}
|
||||
assertEquals(sorted, actual);
|
||||
}
|
||||
|
||||
|
||||
public void testRaw() throws Exception {
|
||||
int num = atLeast(10000);
|
||||
|
||||
Comparator<BytesRef> comparator = BytesRef.getUTF8SortedAsUnicodeComparator();
|
||||
BytesRefHash sorted = new BytesRefHash();
|
||||
TermFreq[] unsorted = new TermFreq[num];
|
||||
byte[] buffer = new byte[0];
|
||||
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
|
||||
|
||||
for (int i = 0; i < num; i++) {
|
||||
BytesRef spare;
|
||||
long weight;
|
||||
do {
|
||||
spare = new BytesRef(_TestUtil.randomUnicodeString(random));
|
||||
if (spare.length + 8 >= buffer.length) {
|
||||
buffer = ArrayUtil.grow(buffer, spare.length + 8);
|
||||
}
|
||||
output.reset(buffer);
|
||||
output.writeBytes(spare.bytes, spare.offset, spare.length);
|
||||
weight = random.nextLong();
|
||||
output.writeLong(weight);
|
||||
|
||||
} while (sorted.add(new BytesRef(buffer, 0, output.getPosition())) < 0);
|
||||
unsorted[i] = new TermFreq(spare, weight);
|
||||
}
|
||||
|
||||
// test the sorted iterator wrapper
|
||||
TermFreqIterator wrapper = new SortedTermFreqIteratorWrapper(new TermFreqArrayIterator(unsorted), comparator, true);
|
||||
int[] sort = sorted.sort(comparator);
|
||||
int size = sorted.size();
|
||||
BytesRef spare = new BytesRef();
|
||||
for (int i = 0; i < size; i++) {
|
||||
sorted.get(sort[i], spare);
|
||||
spare.length -= 8; // sub the long value
|
||||
assertEquals(spare, wrapper.next());
|
||||
spare.offset = spare.offset + spare.length;
|
||||
spare.length = 8;
|
||||
assertEquals(asLong(spare), wrapper.weight());
|
||||
}
|
||||
assertNull(wrapper.next());
|
||||
}
|
||||
|
||||
public static long asLong(BytesRef b) {
|
||||
return (((long) asIntInternal(b, b.offset) << 32) | asIntInternal(b,
|
||||
b.offset + 4) & 0xFFFFFFFFL);
|
||||
}
|
||||
|
||||
private static int asIntInternal(BytesRef b, int pos) {
|
||||
return ((b.bytes[pos++] & 0xFF) << 24) | ((b.bytes[pos++] & 0xFF) << 16)
|
||||
| ((b.bytes[pos++] & 0xFF) << 8) | (b.bytes[pos] & 0xFF);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,9 +17,8 @@ package org.apache.lucene.search.suggest.fst;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.BytesRefIterator;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.junit.Test;
|
||||
|
||||
|
@ -31,7 +30,7 @@ public class BytesRefSortersTest extends LuceneTestCase {
|
|||
|
||||
@Test
|
||||
public void testInMemorySorter() throws Exception {
|
||||
check(new InMemorySorter());
|
||||
check(new InMemorySorter(BytesRef.getUTF8SortedAsUnicodeComparator()));
|
||||
}
|
||||
|
||||
private void check(BytesRefSorter sorter) throws Exception {
|
||||
|
@ -42,8 +41,8 @@ public class BytesRefSortersTest extends LuceneTestCase {
|
|||
}
|
||||
|
||||
// Create two iterators and check that they're aligned with each other.
|
||||
Iterator<BytesRef> i1 = sorter.iterator();
|
||||
Iterator<BytesRef> i2 = sorter.iterator();
|
||||
BytesRefIterator i1 = sorter.iterator();
|
||||
BytesRefIterator i2 = sorter.iterator();
|
||||
|
||||
// Verify sorter contract.
|
||||
try {
|
||||
|
@ -52,10 +51,12 @@ public class BytesRefSortersTest extends LuceneTestCase {
|
|||
} catch (IllegalStateException e) {
|
||||
// Expected.
|
||||
}
|
||||
|
||||
while (i1.hasNext() && i2.hasNext()) {
|
||||
assertEquals(i1.next(), i2.next());
|
||||
BytesRef spare1;
|
||||
BytesRef spare2;
|
||||
while ((spare1 = i1.next()) != null && (spare2 = i2.next()) != null) {
|
||||
assertEquals(spare1, spare2);
|
||||
}
|
||||
assertEquals(i1.hasNext(), i2.hasNext());
|
||||
assertNull(i1.next());
|
||||
assertNull(i2.next());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -165,9 +165,9 @@ public class FSTCompletionTest extends LuceneTestCase {
|
|||
|
||||
// All the weights were constant, so all returned buckets must be constant, whatever they
|
||||
// are.
|
||||
Float previous = null;
|
||||
Long previous = null;
|
||||
for (TermFreq tf : keys) {
|
||||
Float current = (Float)lookup.get(_TestUtil.bytesToCharSequence(tf.term, random));
|
||||
Long current = ((Number)lookup.get(_TestUtil.bytesToCharSequence(tf.term, random))).longValue();
|
||||
if (previous != null) {
|
||||
assertEquals(previous, current);
|
||||
}
|
||||
|
@ -181,7 +181,7 @@ public class FSTCompletionTest extends LuceneTestCase {
|
|||
FSTCompletionLookup lookup = new FSTCompletionLookup();
|
||||
lookup.build(new TermFreqArrayIterator(input));
|
||||
for (TermFreq tf : input) {
|
||||
assertTrue("Not found: " + tf.term.toString(), lookup.get(_TestUtil.bytesToCharSequence(tf.term, random)) != null);
|
||||
assertNotNull("Not found: " + tf.term.toString(), lookup.get(_TestUtil.bytesToCharSequence(tf.term, random)));
|
||||
assertEquals(tf.term.utf8ToString(), lookup.lookup(_TestUtil.bytesToCharSequence(tf.term, random), true, 1).get(0).key.toString());
|
||||
}
|
||||
|
||||
|
|
|
@ -1,140 +0,0 @@
|
|||
package org.apache.lucene.search.suggest.fst;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.*;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.NumericUtils;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
|
||||
public class FloatMagicTest extends LuceneTestCase {
|
||||
public void testFloatMagic() {
|
||||
ArrayList<Float> floats = new ArrayList<Float>(Arrays.asList(
|
||||
Float.intBitsToFloat(0x7f800001), // NaN (invalid combination).
|
||||
Float.intBitsToFloat(0x7fffffff), // NaN (invalid combination).
|
||||
Float.intBitsToFloat(0xff800001), // NaN (invalid combination).
|
||||
Float.intBitsToFloat(0xffffffff), // NaN (invalid combination).
|
||||
Float.POSITIVE_INFINITY,
|
||||
Float.MAX_VALUE,
|
||||
100f,
|
||||
0f,
|
||||
0.1f,
|
||||
Float.MIN_VALUE,
|
||||
Float.NaN,
|
||||
-0.0f,
|
||||
-Float.MIN_VALUE,
|
||||
-0.1f,
|
||||
-1f,
|
||||
-10f,
|
||||
Float.NEGATIVE_INFINITY));
|
||||
|
||||
// Sort them using juc.
|
||||
Collections.sort(floats);
|
||||
|
||||
// Convert to sortable int4 representation (as long to have an unsigned sort).
|
||||
long [] int4 = new long [floats.size()];
|
||||
for (int i = 0; i < floats.size(); i++) {
|
||||
int4[i] = FloatMagic.toSortable(floats.get(i)) & 0xffffffffL;
|
||||
|
||||
/*
|
||||
System.out.println(
|
||||
String.format("raw %8s sortable %8s %8s numutils %8s %s",
|
||||
Integer.toHexString(Float.floatToRawIntBits(floats.get(i))),
|
||||
Integer.toHexString(FloatMagic.toSortable(floats.get(i))),
|
||||
Integer.toHexString(FloatMagic.unsignedOrderedToFloatBits(FloatMagic.toSortable(floats.get(i)))),
|
||||
Integer.toHexString(NumericUtils.floatToSortableInt(floats.get(i))),
|
||||
floats.get(i)));
|
||||
*/
|
||||
}
|
||||
|
||||
// Sort and compare. Should be identical order.
|
||||
Arrays.sort(int4);
|
||||
ArrayList<Float> backFromFixed = new ArrayList<Float>();
|
||||
for (int i = 0; i < int4.length; i++) {
|
||||
backFromFixed.add(FloatMagic.fromSortable((int) int4[i]));
|
||||
}
|
||||
|
||||
/*
|
||||
for (int i = 0; i < int4.length; i++) {
|
||||
System.out.println(
|
||||
floats.get(i) + " " + FloatMagic.fromSortable((int) int4[i]));
|
||||
}
|
||||
*/
|
||||
|
||||
assertEquals(floats, backFromFixed);
|
||||
}
|
||||
|
||||
@Ignore("Once checked, valid forever?") @Test
|
||||
public void testRoundTripFullRange() {
|
||||
int i = 0;
|
||||
do {
|
||||
float f = Float.intBitsToFloat(i);
|
||||
float f2 = FloatMagic.fromSortable(FloatMagic.toSortable(f));
|
||||
|
||||
if (!((Float.isNaN(f) && Float.isNaN(f2)) || f == f2)) {
|
||||
throw new RuntimeException("! " + Integer.toHexString(i) + "> " + f + " " + f2);
|
||||
}
|
||||
|
||||
if ((i & 0xffffff) == 0) {
|
||||
System.out.println(Integer.toHexString(i));
|
||||
}
|
||||
|
||||
i++;
|
||||
} while (i != 0);
|
||||
}
|
||||
|
||||
@Ignore("Once checked, valid forever?") @Test
|
||||
public void testIncreasingFullRange() {
|
||||
// -infinity ... -0.0
|
||||
for (int i = 0xff800000; i != 0x80000000; i--) {
|
||||
checkSmaller(i, i - 1);
|
||||
}
|
||||
|
||||
// -0.0 +0.0
|
||||
checkSmaller(0x80000000, 0);
|
||||
|
||||
// +0.0 ... +infinity
|
||||
for (int i = 0; i != 0x7f800000; i++) {
|
||||
checkSmaller(i, i + 1);
|
||||
}
|
||||
|
||||
// All other are NaNs and should be after positive infinity.
|
||||
final long infinity = toSortableL(Float.POSITIVE_INFINITY);
|
||||
for (int i = 0x7f800001; i != 0x7fffffff; i++) {
|
||||
assertTrue(infinity < toSortableL(Float.intBitsToFloat(i)));
|
||||
}
|
||||
for (int i = 0xff800001; i != 0xffffffff; i++) {
|
||||
assertTrue(infinity < toSortableL(Float.intBitsToFloat(i)));
|
||||
}
|
||||
}
|
||||
|
||||
private long toSortableL(float f) {
|
||||
return FloatMagic.toSortable(f) & 0xffffffffL;
|
||||
}
|
||||
|
||||
private void checkSmaller(int i1, int i2) {
|
||||
float f1 = Float.intBitsToFloat(i1);
|
||||
float f2 = Float.intBitsToFloat(i2);
|
||||
if (f1 > f2) {
|
||||
throw new AssertionError(f1 + " " + f2 + " " + i1 + " " + i2);
|
||||
}
|
||||
assertTrue(toSortableL(f1) < toSortableL(f2));
|
||||
}
|
||||
}
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.search.suggest.fst;
|
|||
import java.io.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
|
||||
import org.apache.lucene.search.suggest.fst.Sort.BufferSize;
|
||||
import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesWriter;
|
||||
|
@ -61,7 +62,7 @@ public class TestSort extends LuceneTestCase {
|
|||
@Test
|
||||
public void testIntermediateMerges() throws Exception {
|
||||
// Sort 20 mb worth of data with 1mb buffer, binary merging.
|
||||
SortInfo info = checkSort(new Sort(BufferSize.megabytes(1), Sort.defaultTempDir(), 2),
|
||||
SortInfo info = checkSort(new Sort(Sort.DEFAULT_COMPARATOR, BufferSize.megabytes(1), Sort.defaultTempDir(), 2),
|
||||
generateRandom(Sort.MB * 20));
|
||||
assertTrue(info.mergeRounds > 10);
|
||||
}
|
||||
|
@ -69,7 +70,7 @@ public class TestSort extends LuceneTestCase {
|
|||
@Test
|
||||
public void testSmallRandom() throws Exception {
|
||||
// Sort 20 mb worth of data with 1mb buffer.
|
||||
SortInfo sortInfo = checkSort(new Sort(BufferSize.megabytes(1), Sort.defaultTempDir(), Sort.MAX_TEMPFILES),
|
||||
SortInfo sortInfo = checkSort(new Sort(Sort.DEFAULT_COMPARATOR, BufferSize.megabytes(1), Sort.defaultTempDir(), Sort.MAX_TEMPFILES),
|
||||
generateRandom(Sort.MB * 20));
|
||||
assertEquals(1, sortInfo.mergeRounds);
|
||||
}
|
||||
|
@ -77,7 +78,7 @@ public class TestSort extends LuceneTestCase {
|
|||
@Test @Nightly
|
||||
public void testLargerRandom() throws Exception {
|
||||
// Sort 100MB worth of data with 16mb buffer.
|
||||
checkSort(new Sort(BufferSize.megabytes(16), Sort.defaultTempDir(), Sort.MAX_TEMPFILES),
|
||||
checkSort(new Sort(Sort.DEFAULT_COMPARATOR, BufferSize.megabytes(16), Sort.defaultTempDir(), Sort.MAX_TEMPFILES),
|
||||
generateRandom(Sort.MB * 100));
|
||||
}
|
||||
|
||||
|
@ -93,13 +94,24 @@ public class TestSort extends LuceneTestCase {
|
|||
return bytes;
|
||||
}
|
||||
|
||||
static final Comparator<byte[]> unsignedByteOrderComparator = new Comparator<byte[]>() {
|
||||
public int compare(byte[] left, byte[] right) {
|
||||
final int max = Math.min(left.length, right.length);
|
||||
for (int i = 0, j = 0; i < max; i++, j++) {
|
||||
int diff = (left[i] & 0xff) - (right[j] & 0xff);
|
||||
if (diff != 0)
|
||||
return diff;
|
||||
}
|
||||
return left.length - right.length;
|
||||
}
|
||||
};
|
||||
/**
|
||||
* Check sorting data on an instance of {@link Sort}.
|
||||
*/
|
||||
private SortInfo checkSort(Sort sort, byte[][] data) throws IOException {
|
||||
File unsorted = writeAll("unsorted", data);
|
||||
|
||||
Arrays.sort(data, Sort.unsignedByteOrderComparator);
|
||||
Arrays.sort(data, unsignedByteOrderComparator);
|
||||
File golden = writeAll("golden", data);
|
||||
|
||||
File sorted = new File(tempDir, "sorted");
|
||||
|
|
|
@ -117,7 +117,7 @@ public class WFSTCompletionTest extends LuceneTestCase {
|
|||
// TODO: could be faster... but it's slowCompletor for a reason
|
||||
for (Map.Entry<String,Long> e : slowCompletor.entrySet()) {
|
||||
if (e.getKey().startsWith(prefix)) {
|
||||
matches.add(new LookupResult(e.getKey(), (float)e.getValue().longValue()));
|
||||
matches.add(new LookupResult(e.getKey(), e.getValue().longValue()));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -153,11 +153,6 @@ public class Suggester extends SolrSpellChecker {
|
|||
build(core, searcher);
|
||||
}
|
||||
|
||||
public void add(CharsRef query, int numHits) {
|
||||
LOG.info("add " + query + ", " + numHits);
|
||||
lookup.add(query, new Integer(numHits));
|
||||
}
|
||||
|
||||
static SpellingResult EMPTY_RESULT = new SpellingResult();
|
||||
|
||||
@Override
|
||||
|
@ -182,7 +177,7 @@ public class Suggester extends SolrSpellChecker {
|
|||
Collections.sort(suggestions);
|
||||
}
|
||||
for (LookupResult lr : suggestions) {
|
||||
res.add(t, lr.key.toString(), ((Number)lr.value).intValue());
|
||||
res.add(t, lr.key.toString(), (int)lr.value);
|
||||
}
|
||||
}
|
||||
return res;
|
||||
|
|
Loading…
Reference in New Issue