mirror of https://github.com/apache/lucene.git
LUCENE-3807: Cleanup Suggest / Lookup API
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1296268 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2c94c522fd
commit
f303bcd465
|
@ -115,6 +115,13 @@ Changes in backwards compatibility policy
|
||||||
* LUCENE-3626: The internal implementation classes in PKIndexSplitter
|
* LUCENE-3626: The internal implementation classes in PKIndexSplitter
|
||||||
and MultiPassIndexSplitter were made private as they now work
|
and MultiPassIndexSplitter were made private as they now work
|
||||||
per segment. (Uwe Schindler)
|
per segment. (Uwe Schindler)
|
||||||
|
|
||||||
|
* LUCENE-3807: Cleaned up Suggest / Lookup API. Term weights (freqs) are now
|
||||||
|
64bit signed integers instead of 32bit floats. Sorting of terms is now a
|
||||||
|
disk based merge sort instead of an in-memory sort. The Lookup API now
|
||||||
|
accepts and returns CharSequence instead of String which should be converted
|
||||||
|
into a String before being used in a data structure that relies on hashCode / equals.
|
||||||
|
(Simon Willnauer)
|
||||||
|
|
||||||
Changes in Runtime Behavior
|
Changes in Runtime Behavior
|
||||||
|
|
||||||
|
|
|
@ -25,6 +25,7 @@ import org.apache.lucene.util.BytesRef;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This wrapper buffers incoming elements.
|
* This wrapper buffers incoming elements.
|
||||||
|
* @lucene.experimental
|
||||||
*/
|
*/
|
||||||
public class BufferingTermFreqIteratorWrapper implements TermFreqIterator {
|
public class BufferingTermFreqIteratorWrapper implements TermFreqIterator {
|
||||||
// TODO keep this for now
|
// TODO keep this for now
|
||||||
|
|
|
@ -18,81 +18,113 @@ package org.apache.lucene.search.suggest;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.Arrays;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
|
|
||||||
import org.apache.lucene.util.ArrayUtil;
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
import org.apache.lucene.util.ByteBlockPool;
|
import org.apache.lucene.util.ByteBlockPool;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.BytesRefIterator;
|
import org.apache.lucene.util.BytesRefIterator;
|
||||||
|
import org.apache.lucene.util.Counter;
|
||||||
|
import org.apache.lucene.util.RamUsageEstimator;
|
||||||
import org.apache.lucene.util.SorterTemplate;
|
import org.apache.lucene.util.SorterTemplate;
|
||||||
|
|
||||||
final class BytesRefList {
|
/**
|
||||||
|
* A simple append-only random-access {@link BytesRef} array that stores full
|
||||||
|
* copies of the appended bytes in a {@link ByteBlockPool}.
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* <b>Note: This class is not Thread-Safe!</b>
|
||||||
|
*
|
||||||
|
* @lucene.internal
|
||||||
|
* @lucene.experimental
|
||||||
|
*/
|
||||||
|
public final class BytesRefList {
|
||||||
|
// TODO rename to BytesRefArray
|
||||||
private final ByteBlockPool pool;
|
private final ByteBlockPool pool;
|
||||||
private int[] offsets = new int[1];
|
private int[] offsets = new int[1];
|
||||||
private int currentElement = 0;
|
private int lastElement = 0;
|
||||||
private int currentOffset = 0;
|
private int currentOffset = 0;
|
||||||
|
private final Counter bytesUsed = Counter.newCounter(false);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new {@link BytesRefList}
|
||||||
|
*/
|
||||||
public BytesRefList() {
|
public BytesRefList() {
|
||||||
this(new ByteBlockPool(new ByteBlockPool.DirectAllocator()));
|
this.pool = new ByteBlockPool(new ByteBlockPool.DirectTrackingAllocator(
|
||||||
}
|
bytesUsed));
|
||||||
|
|
||||||
public BytesRefList(ByteBlockPool pool) {
|
|
||||||
this.pool = pool;
|
|
||||||
pool.nextBuffer();
|
pool.nextBuffer();
|
||||||
|
bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_ARRAY_HEADER
|
||||||
|
+ RamUsageEstimator.NUM_BYTES_INT);
|
||||||
}
|
}
|
||||||
|
|
||||||
public int append(BytesRef bytes) {
|
/**
|
||||||
if (currentElement >= offsets.length) {
|
* Clears this {@link BytesRefList}
|
||||||
offsets = ArrayUtil.grow(offsets, offsets.length + 1);
|
*/
|
||||||
}
|
public void clear() {
|
||||||
pool.copy(bytes);
|
lastElement = 0;
|
||||||
offsets[currentElement++] = currentOffset;
|
currentOffset = 0;
|
||||||
currentOffset += bytes.length;
|
Arrays.fill(offsets, 0);
|
||||||
return currentElement;
|
pool.reset();
|
||||||
}
|
|
||||||
|
|
||||||
public int size() {
|
|
||||||
return currentElement;
|
|
||||||
}
|
|
||||||
|
|
||||||
public BytesRef get(BytesRef bytes, int pos) {
|
|
||||||
if (currentElement > pos) {
|
|
||||||
bytes.offset = offsets[pos];
|
|
||||||
bytes.length = pos == currentElement - 1 ? currentOffset - bytes.offset
|
|
||||||
: offsets[pos + 1] - bytes.offset;
|
|
||||||
pool.copyFrom(bytes);
|
|
||||||
return bytes;
|
|
||||||
}
|
|
||||||
throw new IndexOutOfBoundsException("index " + pos
|
|
||||||
+ " must be less than the size: " + currentElement);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
public BytesRefIterator iterator() {
|
|
||||||
final int numElements = currentElement;
|
|
||||||
|
|
||||||
return new BytesRefIterator() {
|
|
||||||
private final BytesRef spare = new BytesRef();
|
|
||||||
private int pos = 0;
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public BytesRef next() throws IOException {
|
|
||||||
if (pos < numElements) {
|
|
||||||
get(spare, pos++);
|
|
||||||
return spare;
|
|
||||||
}
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Comparator<BytesRef> getComparator() {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public int[] sort(final Comparator<BytesRef> comp) {
|
/**
|
||||||
|
* Appends a copy of the given {@link BytesRef} to this {@link BytesRefList}.
|
||||||
|
* @param bytes the bytes to append
|
||||||
|
* @return the ordinal of the appended bytes
|
||||||
|
*/
|
||||||
|
public int append(BytesRef bytes) {
|
||||||
|
if (lastElement >= offsets.length) {
|
||||||
|
int oldLen = offsets.length;
|
||||||
|
offsets = ArrayUtil.grow(offsets, offsets.length + 1);
|
||||||
|
bytesUsed.addAndGet((offsets.length - oldLen)
|
||||||
|
* RamUsageEstimator.NUM_BYTES_INT);
|
||||||
|
}
|
||||||
|
pool.copy(bytes);
|
||||||
|
offsets[lastElement++] = currentOffset;
|
||||||
|
currentOffset += bytes.length;
|
||||||
|
return lastElement;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the current size of this {@link BytesRefList}
|
||||||
|
* @return the current size of this {@link BytesRefList}
|
||||||
|
*/
|
||||||
|
public int size() {
|
||||||
|
return lastElement;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the <i>n'th</i> element of this {@link BytesRefList}
|
||||||
|
* @param spare a spare {@link BytesRef} instance
|
||||||
|
* @param ord the ordinal of the element to retrieve
|
||||||
|
* @return the <i>n'th</i> element of this {@link BytesRefList}
|
||||||
|
*/
|
||||||
|
public BytesRef get(BytesRef spare, int ord) {
|
||||||
|
if (lastElement > ord) {
|
||||||
|
spare.offset = offsets[ord];
|
||||||
|
spare.length = ord == lastElement - 1 ? currentOffset - spare.offset
|
||||||
|
: offsets[ord + 1] - spare.offset;
|
||||||
|
pool.copyFrom(spare);
|
||||||
|
return spare;
|
||||||
|
}
|
||||||
|
throw new IndexOutOfBoundsException("index " + ord
|
||||||
|
+ " must be less than the size: " + lastElement);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the number of internally used bytes to hold the appended bytes in
|
||||||
|
* memory
|
||||||
|
*
|
||||||
|
* @return the number of internally used bytes to hold the appended bytes in
|
||||||
|
* memory
|
||||||
|
*/
|
||||||
|
public long bytesUsed() {
|
||||||
|
return bytesUsed.get();
|
||||||
|
}
|
||||||
|
|
||||||
|
private int[] sort(final Comparator<BytesRef> comp) {
|
||||||
final int[] orderdEntries = new int[size()];
|
final int[] orderdEntries = new int[size()];
|
||||||
for (int i = 0; i < orderdEntries.length; i++) {
|
for (int i = 0; i < orderdEntries.length; i++) {
|
||||||
orderdEntries[i] = i;
|
orderdEntries[i] = i;
|
||||||
|
@ -110,22 +142,65 @@ final class BytesRefList {
|
||||||
final int ord1 = orderdEntries[i], ord2 = orderdEntries[j];
|
final int ord1 = orderdEntries[i], ord2 = orderdEntries[j];
|
||||||
return comp.compare(get(scratch1, ord1), get(scratch2, ord2));
|
return comp.compare(get(scratch1, ord1), get(scratch2, ord2));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void setPivot(int i) {
|
protected void setPivot(int i) {
|
||||||
final int ord = orderdEntries[i];
|
final int ord = orderdEntries[i];
|
||||||
get(pivot, ord);
|
get(pivot, ord);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected int comparePivot(int j) {
|
protected int comparePivot(int j) {
|
||||||
final int ord = orderdEntries[j];
|
final int ord = orderdEntries[j];
|
||||||
return comp.compare(pivot, get(scratch2, ord));
|
return comp.compare(pivot, get(scratch2, ord));
|
||||||
}
|
}
|
||||||
|
|
||||||
private final BytesRef pivot = new BytesRef(),
|
private final BytesRef pivot = new BytesRef(), scratch1 = new BytesRef(),
|
||||||
scratch1 = new BytesRef(), scratch2 = new BytesRef();
|
scratch2 = new BytesRef();
|
||||||
}.quickSort(0, size() - 1);
|
}.quickSort(0, size() - 1);
|
||||||
return orderdEntries;
|
return orderdEntries;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* sugar for {@link #iterator(Comparator)} with a <code>null</code> comparator
|
||||||
|
*/
|
||||||
|
public BytesRefIterator iterator() {
|
||||||
|
return iterator(null);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* <p>
|
||||||
|
* Returns a {@link BytesRefIterator} with point in time semantics. The
|
||||||
|
* iterator provides access to all {@link BytesRef} instances appended so far.
|
||||||
|
* </p>
|
||||||
|
* <p>
|
||||||
|
* If a non <code>null</code> {@link Comparator} is provided the iterator will
|
||||||
|
* iterate the byte values in the order specified by the comparator. Otherwise
|
||||||
|
* the order is the same as the order in which the values were appended.
|
||||||
|
* </p>
|
||||||
|
* <p>
|
||||||
|
* This is a non-destructive operation.
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
public BytesRefIterator iterator(final Comparator<BytesRef> comp) {
|
||||||
|
final BytesRef spare = new BytesRef();
|
||||||
|
final int size = size();
|
||||||
|
final int[] ords = comp == null ? null : sort(comp);
|
||||||
|
return new BytesRefIterator() {
|
||||||
|
int pos = 0;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public BytesRef next() throws IOException {
|
||||||
|
if (pos < size) {
|
||||||
|
return get(spare, ords == null ? pos++ : ords[pos++]);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Comparator<BytesRef> getComparator() {
|
||||||
|
return comp;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -75,7 +75,11 @@ public class FileDictionary implements Dictionary {
|
||||||
String[] fields = line.split("\t");
|
String[] fields = line.split("\t");
|
||||||
if (fields.length > 1) {
|
if (fields.length > 1) {
|
||||||
// keep reading floats for bw compat
|
// keep reading floats for bw compat
|
||||||
curFreq = (int)Float.parseFloat(fields[1]);
|
try {
|
||||||
|
curFreq = Long.parseLong(fields[1]);
|
||||||
|
} catch (NumberFormatException e) {
|
||||||
|
curFreq = (long)Double.parseDouble(fields[1]);
|
||||||
|
}
|
||||||
spare.copyChars(fields[0]);
|
spare.copyChars(fields[0]);
|
||||||
} else {
|
} else {
|
||||||
spare.copyChars(line);
|
spare.copyChars(line);
|
||||||
|
|
|
@ -29,15 +29,19 @@ import org.apache.lucene.search.spell.TermFreqIterator;
|
||||||
import org.apache.lucene.util.BytesRefIterator;
|
import org.apache.lucene.util.BytesRefIterator;
|
||||||
import org.apache.lucene.util.PriorityQueue;
|
import org.apache.lucene.util.PriorityQueue;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple Lookup interface for {@link CharSequence} suggestions.
|
||||||
|
* @lucene.experimental
|
||||||
|
*/
|
||||||
public abstract class Lookup {
|
public abstract class Lookup {
|
||||||
/**
|
/**
|
||||||
* Result of a lookup.
|
* Result of a lookup.
|
||||||
*/
|
*/
|
||||||
public static final class LookupResult implements Comparable<LookupResult> {
|
public static final class LookupResult implements Comparable<LookupResult> {
|
||||||
public final CharSequence key;
|
public final CharSequence key;
|
||||||
public final float value;
|
public final long value;
|
||||||
|
|
||||||
public LookupResult(CharSequence key, float value) {
|
public LookupResult(CharSequence key, long value) {
|
||||||
this.key = key;
|
this.key = key;
|
||||||
this.value = value;
|
this.value = value;
|
||||||
}
|
}
|
||||||
|
@ -112,6 +116,10 @@ public abstract class Lookup {
|
||||||
build(tfit);
|
build(tfit);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds up a new internal {@link Lookup} representation based on the given {@link TermFreqIterator}.
|
||||||
|
* The implementation might re-sort the data internally.
|
||||||
|
*/
|
||||||
public abstract void build(TermFreqIterator tfit) throws IOException;
|
public abstract void build(TermFreqIterator tfit) throws IOException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -124,22 +132,7 @@ public abstract class Lookup {
|
||||||
*/
|
*/
|
||||||
public abstract List<LookupResult> lookup(CharSequence key, boolean onlyMorePopular, int num);
|
public abstract List<LookupResult> lookup(CharSequence key, boolean onlyMorePopular, int num);
|
||||||
|
|
||||||
/**
|
|
||||||
* Modify the lookup data by recording additional data. Optional operation.
|
|
||||||
* @param key new lookup key
|
|
||||||
* @param value value to associate with this key
|
|
||||||
* @return true if new key is added, false if it already exists or operation
|
|
||||||
* is not supported.
|
|
||||||
*/
|
|
||||||
public abstract boolean add(CharSequence key, Object value);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get value associated with a specific key.
|
|
||||||
* @param key lookup key
|
|
||||||
* @return associated value
|
|
||||||
*/
|
|
||||||
public abstract Object get(CharSequence key);
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Persist the constructed lookup data to a directory. Optional operation.
|
* Persist the constructed lookup data to a directory. Optional operation.
|
||||||
* @param output {@link OutputStream} to write the data to.
|
* @param output {@link OutputStream} to write the data to.
|
||||||
|
@ -173,4 +166,5 @@ public abstract class Lookup {
|
||||||
* @throws IOException when fatal IO error occurs.
|
* @throws IOException when fatal IO error occurs.
|
||||||
*/
|
*/
|
||||||
public abstract boolean load(File storeDir) throws IOException;
|
public abstract boolean load(File storeDir) throws IOException;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,45 +17,166 @@ package org.apache.lucene.search.suggest;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
|
|
||||||
import org.apache.lucene.search.spell.TermFreqIterator;
|
import org.apache.lucene.search.spell.TermFreqIterator;
|
||||||
|
import org.apache.lucene.search.suggest.fst.Sort;
|
||||||
|
import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesReader;
|
||||||
|
import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesWriter;
|
||||||
|
import org.apache.lucene.store.ByteArrayDataInput;
|
||||||
|
import org.apache.lucene.store.ByteArrayDataOutput;
|
||||||
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This wrapper buffers incoming elements and makes sure they are sorted in
|
* This wrapper buffers incoming elements and makes sure they are sorted based on the given comparator.
|
||||||
* ascending lexicographic order.
|
* @lucene.experimental
|
||||||
*/
|
*/
|
||||||
public class SortedTermFreqIteratorWrapper extends BufferingTermFreqIteratorWrapper {
|
public class SortedTermFreqIteratorWrapper implements TermFreqIterator {
|
||||||
// TODO keep this for now - but the consumer should really sort this stuff on disk with sorter...
|
|
||||||
private final int[] sortedOrds;
|
private final TermFreqIterator source;
|
||||||
private int currentOrd = -1;
|
private File tempInput;
|
||||||
private final BytesRef spare = new BytesRef();
|
private File tempSorted;
|
||||||
private final Comparator<BytesRef> comp;
|
private final ByteSequencesReader reader;
|
||||||
|
private boolean done = false;
|
||||||
public SortedTermFreqIteratorWrapper(TermFreqIterator source, Comparator<BytesRef> comp) throws IOException {
|
|
||||||
super(source);
|
private long weight;
|
||||||
this.sortedOrds = entries.sort(comp);
|
private final BytesRef scratch = new BytesRef();
|
||||||
this.comp = comp;
|
private final Comparator<BytesRef> comparator;
|
||||||
|
|
||||||
|
public SortedTermFreqIteratorWrapper(TermFreqIterator source, Comparator<BytesRef> comparator) throws IOException {
|
||||||
|
this(source, comparator, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
public SortedTermFreqIteratorWrapper(TermFreqIterator source, Comparator<BytesRef> comparator, boolean compareRawBytes) throws IOException {
|
||||||
public long weight() {
|
this.source = source;
|
||||||
return freqs[currentOrd];
|
this.comparator = comparator;
|
||||||
|
this.reader = sort(compareRawBytes ? comparator : new BytesOnlyComparator(this.comparator));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public BytesRef next() throws IOException {
|
public BytesRef next() throws IOException {
|
||||||
if (++curPos < entries.size()) {
|
boolean success = false;
|
||||||
return entries.get(spare, (currentOrd = sortedOrds[curPos]));
|
if (done) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
ByteArrayDataInput input = new ByteArrayDataInput();
|
||||||
|
if (reader.read(scratch)) {
|
||||||
|
weight = decode(scratch, input);
|
||||||
|
success = true;
|
||||||
|
return scratch;
|
||||||
|
}
|
||||||
|
close();
|
||||||
|
success = done = true;
|
||||||
|
return null;
|
||||||
|
} finally {
|
||||||
|
if (!success) {
|
||||||
|
done = true;
|
||||||
|
close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Comparator<BytesRef> getComparator() {
|
public Comparator<BytesRef> getComparator() {
|
||||||
return comp;
|
return comparator;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long weight() {
|
||||||
|
return weight;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Sort.ByteSequencesReader sort(Comparator<BytesRef> comparator) throws IOException {
|
||||||
|
String prefix = getClass().getSimpleName();
|
||||||
|
File directory = Sort.defaultTempDir();
|
||||||
|
tempInput = File.createTempFile(prefix, ".input", directory);
|
||||||
|
tempSorted = File.createTempFile(prefix, ".sorted", directory);
|
||||||
|
|
||||||
|
final Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput);
|
||||||
|
boolean success = false;
|
||||||
|
try {
|
||||||
|
BytesRef spare;
|
||||||
|
byte[] buffer = new byte[0];
|
||||||
|
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
|
||||||
|
|
||||||
|
while ((spare = source.next()) != null) {
|
||||||
|
encode(writer, output, buffer, spare, source.weight());
|
||||||
|
}
|
||||||
|
writer.close();
|
||||||
|
new Sort(comparator).sort(tempInput, tempSorted);
|
||||||
|
ByteSequencesReader reader = new Sort.ByteSequencesReader(tempSorted);
|
||||||
|
success = true;
|
||||||
|
return reader;
|
||||||
|
|
||||||
|
} finally {
|
||||||
|
if (success) {
|
||||||
|
IOUtils.close(writer);
|
||||||
|
} else {
|
||||||
|
try {
|
||||||
|
IOUtils.closeWhileHandlingException(writer);
|
||||||
|
} finally {
|
||||||
|
close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void close() throws IOException {
|
||||||
|
if (tempInput != null) {
|
||||||
|
tempInput.delete();
|
||||||
|
}
|
||||||
|
if (tempSorted != null) {
|
||||||
|
tempSorted.delete();
|
||||||
|
}
|
||||||
|
IOUtils.close(reader);
|
||||||
|
}
|
||||||
|
|
||||||
|
private final static class BytesOnlyComparator implements Comparator<BytesRef> {
|
||||||
|
|
||||||
|
final Comparator<BytesRef> other;
|
||||||
|
private final BytesRef leftScratch = new BytesRef();
|
||||||
|
private final BytesRef rightScratch = new BytesRef();
|
||||||
|
|
||||||
|
public BytesOnlyComparator(Comparator<BytesRef> other) {
|
||||||
|
this.other = other;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int compare(BytesRef left, BytesRef right) {
|
||||||
|
wrap(leftScratch, left);
|
||||||
|
wrap(rightScratch, right);
|
||||||
|
return other.compare(leftScratch, rightScratch);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void wrap(BytesRef wrapper, BytesRef source) {
|
||||||
|
wrapper.bytes = source.bytes;
|
||||||
|
wrapper.offset = source.offset;
|
||||||
|
wrapper.length = source.length - 8;
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void encode(ByteSequencesWriter writer, ByteArrayDataOutput output, byte[] buffer, BytesRef spare, long weight) throws IOException {
|
||||||
|
if (spare.length + 8 >= buffer.length) {
|
||||||
|
buffer = ArrayUtil.grow(buffer, spare.length + 8);
|
||||||
|
}
|
||||||
|
output.reset(buffer);
|
||||||
|
output.writeBytes(spare.bytes, spare.offset, spare.length);
|
||||||
|
output.writeLong(weight);
|
||||||
|
writer.write(buffer, 0, output.getPosition());
|
||||||
|
}
|
||||||
|
|
||||||
|
protected long decode(BytesRef scratch, ByteArrayDataInput tmpInput) {
|
||||||
|
tmpInput.reset(scratch.bytes);
|
||||||
|
tmpInput.skipBytes(scratch.length - 8); // suggestion + separator
|
||||||
|
scratch.length -= 8; // sep + long
|
||||||
|
return tmpInput.readLong();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -26,6 +26,7 @@ import org.apache.lucene.util.BytesRef;
|
||||||
/**
|
/**
|
||||||
* This wrapper buffers the incoming elements and makes sure they are in
|
* This wrapper buffers the incoming elements and makes sure they are in
|
||||||
* random order.
|
* random order.
|
||||||
|
* @lucene.experimental
|
||||||
*/
|
*/
|
||||||
public class UnsortedTermFreqIteratorWrapper extends BufferingTermFreqIteratorWrapper {
|
public class UnsortedTermFreqIteratorWrapper extends BufferingTermFreqIteratorWrapper {
|
||||||
// TODO keep this for now
|
// TODO keep this for now
|
||||||
|
|
|
@ -18,13 +18,16 @@ package org.apache.lucene.search.suggest.fst;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Iterator;
|
import java.util.Comparator;
|
||||||
|
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.BytesRefIterator;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Collects {@link BytesRef} and then allows one to iterate over their sorted order. Implementations
|
* Collects {@link BytesRef} and then allows one to iterate over their sorted order. Implementations
|
||||||
* of this interface will be called in a single-threaded scenario.
|
* of this interface will be called in a single-threaded scenario.
|
||||||
|
* @lucene.experimental
|
||||||
|
* @lucene.internal
|
||||||
*/
|
*/
|
||||||
public interface BytesRefSorter {
|
public interface BytesRefSorter {
|
||||||
/**
|
/**
|
||||||
|
@ -42,5 +45,7 @@ public interface BytesRefSorter {
|
||||||
*
|
*
|
||||||
* @throws IOException If an I/O exception occurs.
|
* @throws IOException If an I/O exception occurs.
|
||||||
*/
|
*/
|
||||||
Iterator<BytesRef> iterator() throws IOException;
|
BytesRefIterator iterator() throws IOException;
|
||||||
|
|
||||||
|
Comparator<BytesRef> getComparator();
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,59 +18,63 @@ package org.apache.lucene.search.suggest.fst;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
import java.util.Iterator;
|
import java.util.Comparator;
|
||||||
import java.util.NoSuchElementException;
|
|
||||||
|
|
||||||
import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesReader;
|
import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesReader;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.BytesRefIterator;
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Builds and iterates over sequences stored on disk.
|
* Builds and iterates over sequences stored on disk.
|
||||||
|
* @lucene.experimental
|
||||||
|
* @lucene.internal
|
||||||
*/
|
*/
|
||||||
public class ExternalRefSorter implements BytesRefSorter, Closeable {
|
public class ExternalRefSorter implements BytesRefSorter, Closeable {
|
||||||
private final Sort sort;
|
private final Sort sort;
|
||||||
private Sort.ByteSequencesWriter writer;
|
private Sort.ByteSequencesWriter writer;
|
||||||
private File input;
|
private File input;
|
||||||
private File sorted;
|
private File sorted;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Will buffer all sequences to a temporary file and then sort (all on-disk).
|
* Will buffer all sequences to a temporary file and then sort (all on-disk).
|
||||||
*/
|
*/
|
||||||
public ExternalRefSorter(Sort sort) throws IOException {
|
public ExternalRefSorter(Sort sort) throws IOException {
|
||||||
this.sort = sort;
|
this.sort = sort;
|
||||||
this.input = File.createTempFile("RefSorter-", ".raw", Sort.defaultTempDir());
|
this.input = File.createTempFile("RefSorter-", ".raw",
|
||||||
|
Sort.defaultTempDir());
|
||||||
this.writer = new Sort.ByteSequencesWriter(input);
|
this.writer = new Sort.ByteSequencesWriter(input);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void add(BytesRef utf8) throws IOException {
|
public void add(BytesRef utf8) throws IOException {
|
||||||
if (writer == null)
|
if (writer == null) throw new IllegalStateException();
|
||||||
throw new IllegalStateException();
|
|
||||||
writer.write(utf8);
|
writer.write(utf8);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
public BytesRefIterator iterator() throws IOException {
|
||||||
public Iterator<BytesRef> iterator() throws IOException {
|
|
||||||
if (sorted == null) {
|
if (sorted == null) {
|
||||||
closeWriter();
|
closeWriter();
|
||||||
|
|
||||||
sorted = File.createTempFile("RefSorter-", ".sorted", Sort.defaultTempDir());
|
sorted = File.createTempFile("RefSorter-", ".sorted",
|
||||||
|
Sort.defaultTempDir());
|
||||||
sort.sort(input, sorted);
|
sort.sort(input, sorted);
|
||||||
|
|
||||||
input.delete();
|
input.delete();
|
||||||
input = null;
|
input = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
return new ByteSequenceIterator(new Sort.ByteSequencesReader(sorted));
|
return new ByteSequenceIterator(new Sort.ByteSequencesReader(sorted),
|
||||||
|
sort.getComparator());
|
||||||
}
|
}
|
||||||
|
|
||||||
private void closeWriter() throws IOException {
|
private void closeWriter() throws IOException {
|
||||||
if (writer != null) {
|
if (writer != null) {
|
||||||
writer.close();
|
writer.close();
|
||||||
writer = null;
|
writer = null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Removes any written temporary files.
|
* Removes any written temporary files.
|
||||||
*/
|
*/
|
||||||
|
@ -83,40 +87,54 @@ public class ExternalRefSorter implements BytesRefSorter, Closeable {
|
||||||
if (sorted != null) sorted.delete();
|
if (sorted != null) sorted.delete();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Iterate over byte refs in a file.
|
* Iterate over byte refs in a file.
|
||||||
*/
|
*/
|
||||||
class ByteSequenceIterator implements Iterator<BytesRef> {
|
class ByteSequenceIterator implements BytesRefIterator {
|
||||||
private ByteSequencesReader reader;
|
private final ByteSequencesReader reader;
|
||||||
private byte[] next;
|
private BytesRef scratch = new BytesRef();
|
||||||
|
private final Comparator<BytesRef> comparator;
|
||||||
public ByteSequenceIterator(ByteSequencesReader reader) throws IOException {
|
|
||||||
|
public ByteSequenceIterator(ByteSequencesReader reader,
|
||||||
|
Comparator<BytesRef> comparator) {
|
||||||
this.reader = reader;
|
this.reader = reader;
|
||||||
this.next = reader.read();
|
this.comparator = comparator;
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean hasNext() {
|
|
||||||
return next != null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public BytesRef next() {
|
public BytesRef next() throws IOException {
|
||||||
if (next == null) throw new NoSuchElementException();
|
if (scratch == null) {
|
||||||
BytesRef r = new BytesRef(next);
|
return null;
|
||||||
try {
|
}
|
||||||
next = reader.read();
|
boolean success = false;
|
||||||
if (next == null) {
|
try {
|
||||||
reader.close();
|
byte[] next = reader.read();
|
||||||
}
|
if (next != null) {
|
||||||
} catch (IOException e) {
|
scratch.bytes = next;
|
||||||
throw new RuntimeException(e);
|
scratch.length = next.length;
|
||||||
|
scratch.offset = 0;
|
||||||
|
} else {
|
||||||
|
IOUtils.close(reader);
|
||||||
|
scratch = null;
|
||||||
|
}
|
||||||
|
success = true;
|
||||||
|
return scratch;
|
||||||
|
} finally {
|
||||||
|
if (!success) {
|
||||||
|
IOUtils.closeWhileHandlingException(reader);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return r;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void remove() { throw new UnsupportedOperationException(); }
|
public Comparator<BytesRef> getComparator() {
|
||||||
|
return comparator;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Comparator<BytesRef> getComparator() {
|
||||||
|
return sort.getComparator();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -28,6 +28,7 @@ import org.apache.lucene.util.fst.FST.Arc;
|
||||||
* Finite state automata based implementation of "autocomplete" functionality.
|
* Finite state automata based implementation of "autocomplete" functionality.
|
||||||
*
|
*
|
||||||
* @see FSTCompletionBuilder
|
* @see FSTCompletionBuilder
|
||||||
|
* @lucene.experimental
|
||||||
*/
|
*/
|
||||||
|
|
||||||
// TODO: we could store exact weights as outputs from the FST (int4 encoded
|
// TODO: we could store exact weights as outputs from the FST (int4 encoded
|
||||||
|
@ -159,10 +160,10 @@ public class FSTCompletion {
|
||||||
* @param utf8
|
* @param utf8
|
||||||
* The sequence of utf8 bytes to follow.
|
* The sequence of utf8 bytes to follow.
|
||||||
*
|
*
|
||||||
* @return Returns the bucket number of the match or <code>null</code> if no
|
* @return Returns the bucket number of the match or <code>-1</code> if no
|
||||||
* match was found.
|
* match was found.
|
||||||
*/
|
*/
|
||||||
private Integer getExactMatchStartingFromRootArc(
|
private int getExactMatchStartingFromRootArc(
|
||||||
int rootArcIndex, BytesRef utf8) {
|
int rootArcIndex, BytesRef utf8) {
|
||||||
// Get the UTF-8 bytes representation of the input key.
|
// Get the UTF-8 bytes representation of the input key.
|
||||||
try {
|
try {
|
||||||
|
@ -186,7 +187,7 @@ public class FSTCompletion {
|
||||||
}
|
}
|
||||||
|
|
||||||
// No match.
|
// No match.
|
||||||
return null;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -273,8 +274,8 @@ public class FSTCompletion {
|
||||||
// exact match, if requested.
|
// exact match, if requested.
|
||||||
if (exactFirst) {
|
if (exactFirst) {
|
||||||
if (!checkExistingAndReorder(res, key)) {
|
if (!checkExistingAndReorder(res, key)) {
|
||||||
Integer exactMatchBucket = getExactMatchStartingFromRootArc(i, key);
|
int exactMatchBucket = getExactMatchStartingFromRootArc(i, key);
|
||||||
if (exactMatchBucket != null) {
|
if (exactMatchBucket != -1) {
|
||||||
// Insert as the first result and truncate at num.
|
// Insert as the first result and truncate at num.
|
||||||
while (res.size() >= num) {
|
while (res.size() >= num) {
|
||||||
res.remove(res.size() - 1);
|
res.remove(res.size() - 1);
|
||||||
|
@ -385,10 +386,10 @@ public class FSTCompletion {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the bucket assigned to a given key (if found) or <code>null</code> if
|
* Returns the bucket assigned to a given key (if found) or <code>-1</code> if
|
||||||
* no exact match exists.
|
* no exact match exists.
|
||||||
*/
|
*/
|
||||||
public Integer getBucket(CharSequence key) {
|
public int getBucket(CharSequence key) {
|
||||||
return getExactMatchStartingFromRootArc(0, new BytesRef(key));
|
return getExactMatchStartingFromRootArc(0, new BytesRef(key));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -19,9 +19,9 @@ package org.apache.lucene.search.suggest.fst;
|
||||||
|
|
||||||
import java.io.Closeable;
|
import java.io.Closeable;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Iterator;
|
|
||||||
|
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.BytesRefIterator;
|
||||||
import org.apache.lucene.util.IntsRef;
|
import org.apache.lucene.util.IntsRef;
|
||||||
import org.apache.lucene.util.fst.*;
|
import org.apache.lucene.util.fst.*;
|
||||||
|
|
||||||
|
@ -98,6 +98,7 @@ import org.apache.lucene.util.fst.*;
|
||||||
* change, requiring you to rebuild the FST suggest index.
|
* change, requiring you to rebuild the FST suggest index.
|
||||||
*
|
*
|
||||||
* @see FSTCompletion
|
* @see FSTCompletion
|
||||||
|
* @lucene.experimental
|
||||||
*/
|
*/
|
||||||
public class FSTCompletionBuilder {
|
public class FSTCompletionBuilder {
|
||||||
/**
|
/**
|
||||||
|
@ -143,10 +144,11 @@ public class FSTCompletionBuilder {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates an {@link FSTCompletion} with default options: 10 buckets, exact match
|
* Creates an {@link FSTCompletion} with default options: 10 buckets, exact match
|
||||||
* promoted to first position and {@link InMemorySorter}.
|
* promoted to first position and {@link InMemorySorter} with a comparator obtained from
|
||||||
|
* {@link BytesRef#getUTF8SortedAsUnicodeComparator()}.
|
||||||
*/
|
*/
|
||||||
public FSTCompletionBuilder() {
|
public FSTCompletionBuilder() {
|
||||||
this(DEFAULT_BUCKETS, new InMemorySorter(), Integer.MAX_VALUE);
|
this(DEFAULT_BUCKETS, new InMemorySorter(BytesRef.getUTF8SortedAsUnicodeComparator()), Integer.MAX_VALUE);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -237,10 +239,12 @@ public class FSTCompletionBuilder {
|
||||||
shareMaxTailLength, outputs, null, false);
|
shareMaxTailLength, outputs, null, false);
|
||||||
|
|
||||||
BytesRef scratch = new BytesRef();
|
BytesRef scratch = new BytesRef();
|
||||||
|
BytesRef entry;
|
||||||
final IntsRef scratchIntsRef = new IntsRef();
|
final IntsRef scratchIntsRef = new IntsRef();
|
||||||
int count = 0;
|
int count = 0;
|
||||||
for (Iterator<BytesRef> i = sorter.iterator(); i.hasNext(); count++) {
|
BytesRefIterator iter = sorter.iterator();
|
||||||
BytesRef entry = i.next();
|
while((entry = iter.next()) != null) {
|
||||||
|
count++;
|
||||||
if (scratch.compareTo(entry) != 0) {
|
if (scratch.compareTo(entry) != 0) {
|
||||||
builder.add(Util.toIntsRef(entry, scratchIntsRef), empty);
|
builder.add(Util.toIntsRef(entry, scratchIntsRef), empty);
|
||||||
scratch.copyBytes(entry);
|
scratch.copyBytes(entry);
|
||||||
|
|
|
@ -59,6 +59,7 @@ import org.apache.lucene.util.fst.NoOutputs;
|
||||||
* use {@link FSTCompletion} directly or {@link TSTLookup}, for example.
|
* use {@link FSTCompletion} directly or {@link TSTLookup}, for example.
|
||||||
*
|
*
|
||||||
* @see FSTCompletion
|
* @see FSTCompletion
|
||||||
|
* @lucene.experimental
|
||||||
*/
|
*/
|
||||||
public class FSTCompletionLookup extends Lookup {
|
public class FSTCompletionLookup extends Lookup {
|
||||||
/**
|
/**
|
||||||
|
@ -171,7 +172,7 @@ public class FSTCompletionLookup extends Lookup {
|
||||||
}
|
}
|
||||||
|
|
||||||
output.reset(buffer);
|
output.reset(buffer);
|
||||||
output.writeInt(FloatMagic.toSortable(tfit.weight()));
|
output.writeInt(encodeWeight(tfit.weight()));
|
||||||
output.writeBytes(spare.bytes, spare.offset, spare.length);
|
output.writeBytes(spare.bytes, spare.offset, spare.length);
|
||||||
writer.write(buffer, 0, output.getPosition());
|
writer.write(buffer, 0, output.getPosition());
|
||||||
}
|
}
|
||||||
|
@ -188,13 +189,13 @@ public class FSTCompletionLookup extends Lookup {
|
||||||
reader = new Sort.ByteSequencesReader(tempSorted);
|
reader = new Sort.ByteSequencesReader(tempSorted);
|
||||||
long line = 0;
|
long line = 0;
|
||||||
int previousBucket = 0;
|
int previousBucket = 0;
|
||||||
float previousScore = 0;
|
int previousScore = 0;
|
||||||
ByteArrayDataInput input = new ByteArrayDataInput();
|
ByteArrayDataInput input = new ByteArrayDataInput();
|
||||||
BytesRef tmp1 = new BytesRef();
|
BytesRef tmp1 = new BytesRef();
|
||||||
BytesRef tmp2 = new BytesRef();
|
BytesRef tmp2 = new BytesRef();
|
||||||
while (reader.read(tmp1)) {
|
while (reader.read(tmp1)) {
|
||||||
input.reset(tmp1.bytes);
|
input.reset(tmp1.bytes);
|
||||||
float currentScore = FloatMagic.fromSortable(input.readInt());
|
int currentScore = input.readInt();
|
||||||
|
|
||||||
int bucket;
|
int bucket;
|
||||||
if (line > 0 && currentScore == previousScore) {
|
if (line > 0 && currentScore == previousScore) {
|
||||||
|
@ -230,6 +231,14 @@ public class FSTCompletionLookup extends Lookup {
|
||||||
tempSorted.delete();
|
tempSorted.delete();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** weight -> cost */
|
||||||
|
private static int encodeWeight(long value) {
|
||||||
|
if (value < Integer.MIN_VALUE || value > Integer.MAX_VALUE) {
|
||||||
|
throw new UnsupportedOperationException("cannot encode value: " + value);
|
||||||
|
}
|
||||||
|
return (int)value;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<LookupResult> lookup(CharSequence key, boolean higherWeightsFirst, int num) {
|
public List<LookupResult> lookup(CharSequence key, boolean higherWeightsFirst, int num) {
|
||||||
|
@ -250,19 +259,9 @@ public class FSTCompletionLookup extends Lookup {
|
||||||
return results;
|
return results;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean add(CharSequence key, Object value) {
|
|
||||||
// Not supported.
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Object get(CharSequence key) {
|
public Object get(CharSequence key) {
|
||||||
Integer bucket = normalCompletion.getBucket(key);
|
final int bucket = normalCompletion.getBucket(key);
|
||||||
if (bucket == null)
|
return bucket == -1 ? null : Long.valueOf(bucket);
|
||||||
return null;
|
|
||||||
else
|
|
||||||
return (float) normalCompletion.getBucket(key) / normalCompletion.getBucketCount();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -1,75 +0,0 @@
|
||||||
package org.apache.lucene.search.suggest.fst;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
import org.apache.lucene.util.NumericUtils;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Converts normalized float representations ({@link Float#floatToIntBits(float)})
|
|
||||||
* into integers that are directly sortable in int4 representation (or unsigned values or
|
|
||||||
* after promoting to a long with higher 32-bits zeroed).
|
|
||||||
*/
|
|
||||||
class FloatMagic {
|
|
||||||
/**
|
|
||||||
* Convert a float to a directly sortable unsigned integer. For sortable signed
|
|
||||||
* integers, see {@link NumericUtils#floatToSortableInt(float)}.
|
|
||||||
*/
|
|
||||||
public static int toSortable(float f) {
|
|
||||||
return floatBitsToUnsignedOrdered(Float.floatToRawIntBits(f));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Back from {@link #toSortable(float)} to float.
|
|
||||||
*/
|
|
||||||
public static float fromSortable(int v) {
|
|
||||||
return Float.intBitsToFloat(unsignedOrderedToFloatBits(v));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Convert float bits to directly sortable bits.
|
|
||||||
* Normalizes all NaNs to canonical form.
|
|
||||||
*/
|
|
||||||
static int floatBitsToUnsignedOrdered(int v) {
|
|
||||||
// Canonicalize NaN ranges. I assume this check will be faster here than
|
|
||||||
// (v == v) == false on the FPU? We don't distinguish between different
|
|
||||||
// flavors of NaNs here (see http://en.wikipedia.org/wiki/NaN). I guess
|
|
||||||
// in Java this doesn't matter much anyway.
|
|
||||||
if ((v & 0x7fffffff) > 0x7f800000) {
|
|
||||||
// Apply the logic below to a canonical "quiet NaN"
|
|
||||||
return 0x7fc00000 ^ 0x80000000;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (v < 0) {
|
|
||||||
// Reverse the order of negative values and push them before positive values.
|
|
||||||
return ~v;
|
|
||||||
} else {
|
|
||||||
// Shift positive values after negative, but before NaNs, they're sorted already.
|
|
||||||
return v ^ 0x80000000;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Back from {@link #floatBitsToUnsignedOrdered(int)}.
|
|
||||||
*/
|
|
||||||
static int unsignedOrderedToFloatBits(int v) {
|
|
||||||
if (v < 0)
|
|
||||||
return v & ~0x80000000;
|
|
||||||
else
|
|
||||||
return ~v;
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -17,29 +17,40 @@ package org.apache.lucene.search.suggest.fst;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.Comparator;
|
||||||
|
|
||||||
|
import org.apache.lucene.search.suggest.BytesRefList;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.BytesRefIterator;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* An {@link BytesRefSorter} that keeps all the entries in memory.
|
* An {@link BytesRefSorter} that keeps all the entries in memory.
|
||||||
|
* @lucene.experimental
|
||||||
|
* @lucene.internal
|
||||||
*/
|
*/
|
||||||
public final class InMemorySorter implements BytesRefSorter {
|
public final class InMemorySorter implements BytesRefSorter {
|
||||||
// TODO: use a single byte[] to back up all entries?
|
private final BytesRefList buffer = new BytesRefList();
|
||||||
private final ArrayList<BytesRef> refs = new ArrayList<BytesRef>();
|
|
||||||
|
|
||||||
private boolean closed = false;
|
private boolean closed = false;
|
||||||
|
private final Comparator<BytesRef> comparator;
|
||||||
|
|
||||||
|
public InMemorySorter(Comparator<BytesRef> comparator) {
|
||||||
|
this.comparator = comparator;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void add(BytesRef utf8) {
|
public void add(BytesRef utf8) {
|
||||||
if (closed) throw new IllegalStateException();
|
if (closed) throw new IllegalStateException();
|
||||||
refs.add(BytesRef.deepCopyOf(utf8));
|
buffer.append(utf8);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Iterator<BytesRef> iterator() {
|
public BytesRefIterator iterator() {
|
||||||
closed = true;
|
closed = true;
|
||||||
Collections.sort(refs, BytesRef.getUTF8SortedAsUnicodeComparator());
|
return buffer.iterator(comparator);
|
||||||
return Collections.unmodifiableCollection(refs).iterator();
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Comparator<BytesRef> getComparator() {
|
||||||
|
return comparator;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,15 +20,10 @@ package org.apache.lucene.search.suggest.fst;
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
|
import org.apache.lucene.search.suggest.BytesRefList;
|
||||||
import org.apache.lucene.util.*;
|
import org.apache.lucene.util.*;
|
||||||
import org.apache.lucene.util.PriorityQueue;
|
import org.apache.lucene.util.PriorityQueue;
|
||||||
|
|
||||||
// TODO: the buffer is currently byte[][] which with very small arrays will terribly overallocate
|
|
||||||
// memory (alignments) and make GC very happy.
|
|
||||||
//
|
|
||||||
// We could move it to a single byte[] + and use custom sorting, but we'd need to check if this
|
|
||||||
// yields any improvement first.
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* On-disk sorting of byte arrays. Each byte array (entry) is a composed of the following
|
* On-disk sorting of byte arrays. Each byte array (entry) is a composed of the following
|
||||||
* fields:
|
* fields:
|
||||||
|
@ -38,6 +33,8 @@ import org.apache.lucene.util.PriorityQueue;
|
||||||
* </ul>
|
* </ul>
|
||||||
*
|
*
|
||||||
* @see #sort(File, File)
|
* @see #sort(File, File)
|
||||||
|
* @lucene.experimental
|
||||||
|
* @lucene.internal
|
||||||
*/
|
*/
|
||||||
public final class Sort {
|
public final class Sort {
|
||||||
public final static int MB = 1024 * 1024;
|
public final static int MB = 1024 * 1024;
|
||||||
|
@ -59,11 +56,6 @@ public final class Sort {
|
||||||
*/
|
*/
|
||||||
public final static int MAX_TEMPFILES = 128;
|
public final static int MAX_TEMPFILES = 128;
|
||||||
|
|
||||||
/**
|
|
||||||
* Minimum slot buffer expansion.
|
|
||||||
*/
|
|
||||||
private final static int MIN_EXPECTED_GROWTH = 1000;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A bit more descriptive unit for constructors.
|
* A bit more descriptive unit for constructors.
|
||||||
*
|
*
|
||||||
|
@ -111,21 +103,6 @@ public final class Sort {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* byte[] in unsigned byte order.
|
|
||||||
*/
|
|
||||||
static final Comparator<byte[]> unsignedByteOrderComparator = new Comparator<byte[]>() {
|
|
||||||
public int compare(byte[] left, byte[] right) {
|
|
||||||
final int max = Math.min(left.length, right.length);
|
|
||||||
for (int i = 0, j = 0; i < max; i++, j++) {
|
|
||||||
int diff = (left[i] & 0xff) - (right[j] & 0xff);
|
|
||||||
if (diff != 0)
|
|
||||||
return diff;
|
|
||||||
}
|
|
||||||
return left.length - right.length;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Sort info (debugging mostly).
|
* Sort info (debugging mostly).
|
||||||
*/
|
*/
|
||||||
|
@ -149,14 +126,15 @@ public final class Sort {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private final static byte [][] EMPTY = new byte [0][];
|
|
||||||
|
|
||||||
private final BufferSize ramBufferSize;
|
private final BufferSize ramBufferSize;
|
||||||
private final File tempDirectory;
|
private final File tempDirectory;
|
||||||
|
|
||||||
private byte [][] buffer = new byte [0][];
|
private final BytesRefList buffer = new BytesRefList();
|
||||||
private SortInfo sortInfo;
|
private SortInfo sortInfo;
|
||||||
private int maxTempFiles;
|
private int maxTempFiles;
|
||||||
|
private final Comparator<BytesRef> comparator;
|
||||||
|
|
||||||
|
public static final Comparator<BytesRef> DEFAULT_COMPARATOR = BytesRef.getUTF8SortedAsUnicodeComparator();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Defaults constructor.
|
* Defaults constructor.
|
||||||
|
@ -165,13 +143,17 @@ public final class Sort {
|
||||||
* @see BufferSize#automatic()
|
* @see BufferSize#automatic()
|
||||||
*/
|
*/
|
||||||
public Sort() throws IOException {
|
public Sort() throws IOException {
|
||||||
this(BufferSize.automatic(), defaultTempDir(), MAX_TEMPFILES);
|
this(DEFAULT_COMPARATOR, BufferSize.automatic(), defaultTempDir(), MAX_TEMPFILES);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Sort(Comparator<BytesRef> comparator) throws IOException {
|
||||||
|
this(comparator, BufferSize.automatic(), defaultTempDir(), MAX_TEMPFILES);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* All-details constructor.
|
* All-details constructor.
|
||||||
*/
|
*/
|
||||||
public Sort(BufferSize ramBufferSize, File tempDirectory, int maxTempfiles) {
|
public Sort(Comparator<BytesRef> comparator, BufferSize ramBufferSize, File tempDirectory, int maxTempfiles) {
|
||||||
if (ramBufferSize.bytes < ABSOLUTE_MIN_SORT_BUFFER_SIZE) {
|
if (ramBufferSize.bytes < ABSOLUTE_MIN_SORT_BUFFER_SIZE) {
|
||||||
throw new IllegalArgumentException(MIN_BUFFER_SIZE_MSG + ": " + ramBufferSize.bytes);
|
throw new IllegalArgumentException(MIN_BUFFER_SIZE_MSG + ": " + ramBufferSize.bytes);
|
||||||
}
|
}
|
||||||
|
@ -183,6 +165,7 @@ public final class Sort {
|
||||||
this.ramBufferSize = ramBufferSize;
|
this.ramBufferSize = ramBufferSize;
|
||||||
this.tempDirectory = tempDirectory;
|
this.tempDirectory = tempDirectory;
|
||||||
this.maxTempFiles = maxTempfiles;
|
this.maxTempFiles = maxTempfiles;
|
||||||
|
this.comparator = comparator;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -283,23 +266,25 @@ public final class Sort {
|
||||||
|
|
||||||
/** Sort a single partition in-memory. */
|
/** Sort a single partition in-memory. */
|
||||||
protected File sortPartition(int len) throws IOException {
|
protected File sortPartition(int len) throws IOException {
|
||||||
byte [][] data = this.buffer;
|
BytesRefList data = this.buffer;
|
||||||
File tempFile = File.createTempFile("sort", "partition", tempDirectory);
|
File tempFile = File.createTempFile("sort", "partition", tempDirectory);
|
||||||
|
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
Arrays.sort(data, 0, len, unsignedByteOrderComparator);
|
|
||||||
sortInfo.sortTime += (System.currentTimeMillis() - start);
|
sortInfo.sortTime += (System.currentTimeMillis() - start);
|
||||||
|
|
||||||
ByteSequencesWriter out = new ByteSequencesWriter(tempFile);
|
final ByteSequencesWriter out = new ByteSequencesWriter(tempFile);
|
||||||
|
BytesRef spare;
|
||||||
try {
|
try {
|
||||||
for (int i = 0; i < len; i++) {
|
BytesRefIterator iter = buffer.iterator(comparator);
|
||||||
assert data[i].length <= Short.MAX_VALUE;
|
while((spare = iter.next()) != null) {
|
||||||
out.write(data[i]);
|
assert spare.length <= Short.MAX_VALUE;
|
||||||
|
out.write(spare);
|
||||||
}
|
}
|
||||||
|
|
||||||
out.close();
|
out.close();
|
||||||
|
|
||||||
// Clean up the buffer for the next partition.
|
// Clean up the buffer for the next partition.
|
||||||
this.buffer = EMPTY;
|
data.clear();
|
||||||
return tempFile;
|
return tempFile;
|
||||||
} finally {
|
} finally {
|
||||||
IOUtils.close(out);
|
IOUtils.close(out);
|
||||||
|
@ -314,7 +299,7 @@ public final class Sort {
|
||||||
|
|
||||||
PriorityQueue<FileAndTop> queue = new PriorityQueue<FileAndTop>(merges.size()) {
|
PriorityQueue<FileAndTop> queue = new PriorityQueue<FileAndTop>(merges.size()) {
|
||||||
protected boolean lessThan(FileAndTop a, FileAndTop b) {
|
protected boolean lessThan(FileAndTop a, FileAndTop b) {
|
||||||
return a.current.compareTo(b.current) < 0;
|
return comparator.compare(a.current, b.current) < 0;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -359,33 +344,18 @@ public final class Sort {
|
||||||
/** Read in a single partition of data */
|
/** Read in a single partition of data */
|
||||||
int readPartition(ByteSequencesReader reader) throws IOException {
|
int readPartition(ByteSequencesReader reader) throws IOException {
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
|
final BytesRef scratch = new BytesRef();
|
||||||
// We will be reallocating from scratch.
|
while ((scratch.bytes = reader.read()) != null) {
|
||||||
Arrays.fill(this.buffer, null);
|
scratch.length = scratch.bytes.length;
|
||||||
|
buffer.append(scratch);
|
||||||
int bytesLimit = this.ramBufferSize.bytes;
|
|
||||||
byte [][] data = this.buffer;
|
|
||||||
byte[] line;
|
|
||||||
int linesRead = 0;
|
|
||||||
while ((line = reader.read()) != null) {
|
|
||||||
if (linesRead + 1 >= data.length) {
|
|
||||||
data = Arrays.copyOf(data,
|
|
||||||
ArrayUtil.oversize(linesRead + MIN_EXPECTED_GROWTH,
|
|
||||||
RamUsageEstimator.NUM_BYTES_OBJECT_REF));
|
|
||||||
}
|
|
||||||
data[linesRead++] = line;
|
|
||||||
|
|
||||||
// Account for the created objects.
|
// Account for the created objects.
|
||||||
// (buffer slots do not account to buffer size.)
|
// (buffer slots do not account to buffer size.)
|
||||||
bytesLimit -= line.length + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER;
|
if (ramBufferSize.bytes < buffer.bytesUsed()) {
|
||||||
if (bytesLimit < 0) {
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
this.buffer = data;
|
|
||||||
|
|
||||||
sortInfo.readTime += (System.currentTimeMillis() - start);
|
sortInfo.readTime += (System.currentTimeMillis() - start);
|
||||||
return linesRead;
|
return buffer.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
static class FileAndTop {
|
static class FileAndTop {
|
||||||
|
@ -515,5 +485,9 @@ public final class Sort {
|
||||||
((Closeable) is).close();
|
((Closeable) is).close();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public Comparator<BytesRef> getComparator() {
|
||||||
|
return comparator;
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -28,6 +28,8 @@ import java.util.List;
|
||||||
|
|
||||||
import org.apache.lucene.search.spell.TermFreqIterator;
|
import org.apache.lucene.search.spell.TermFreqIterator;
|
||||||
import org.apache.lucene.search.suggest.Lookup;
|
import org.apache.lucene.search.suggest.Lookup;
|
||||||
|
import org.apache.lucene.search.suggest.SortedTermFreqIteratorWrapper;
|
||||||
|
import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesWriter;
|
||||||
import org.apache.lucene.store.ByteArrayDataInput;
|
import org.apache.lucene.store.ByteArrayDataInput;
|
||||||
import org.apache.lucene.store.ByteArrayDataOutput;
|
import org.apache.lucene.store.ByteArrayDataOutput;
|
||||||
import org.apache.lucene.store.InputStreamDataInput;
|
import org.apache.lucene.store.InputStreamDataInput;
|
||||||
|
@ -102,72 +104,27 @@ public class WFSTCompletionLookup extends Lookup {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void build(TermFreqIterator iterator) throws IOException {
|
public void build(TermFreqIterator iterator) throws IOException {
|
||||||
String prefix = getClass().getSimpleName();
|
|
||||||
File directory = Sort.defaultTempDir();
|
|
||||||
File tempInput = File.createTempFile(prefix, ".input", directory);
|
|
||||||
File tempSorted = File.createTempFile(prefix, ".sorted", directory);
|
|
||||||
|
|
||||||
Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput);
|
|
||||||
Sort.ByteSequencesReader reader = null;
|
|
||||||
BytesRef scratch = new BytesRef();
|
BytesRef scratch = new BytesRef();
|
||||||
|
TermFreqIterator iter = new WFSTTermFreqIteratorWrapper(iterator,
|
||||||
boolean success = false;
|
BytesRef.getUTF8SortedAsUnicodeComparator());
|
||||||
try {
|
IntsRef scratchInts = new IntsRef();
|
||||||
byte [] buffer = new byte [0];
|
BytesRef previous = null;
|
||||||
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
|
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
|
||||||
BytesRef spare;
|
Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
while ((spare = iterator.next()) != null) {
|
while ((scratch = iter.next()) != null) {
|
||||||
if (spare.length + 5 >= buffer.length) {
|
long cost = iter.weight();
|
||||||
buffer = ArrayUtil.grow(buffer, spare.length + 5);
|
|
||||||
}
|
|
||||||
|
|
||||||
output.reset(buffer);
|
|
||||||
output.writeBytes(spare.bytes, spare.offset, spare.length);
|
|
||||||
output.writeByte((byte)0); // separator: not used, just for sort order
|
|
||||||
output.writeInt((int)encodeWeight(iterator.weight()));
|
|
||||||
writer.write(buffer, 0, output.getPosition());
|
|
||||||
}
|
|
||||||
writer.close();
|
|
||||||
new Sort().sort(tempInput, tempSorted);
|
|
||||||
reader = new Sort.ByteSequencesReader(tempSorted);
|
|
||||||
|
|
||||||
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
|
if (previous == null) {
|
||||||
Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
|
previous = new BytesRef();
|
||||||
|
} else if (scratch.equals(previous)) {
|
||||||
BytesRef previous = null;
|
continue; // for duplicate suggestions, the best weight is actually
|
||||||
BytesRef suggestion = new BytesRef();
|
// added
|
||||||
IntsRef scratchInts = new IntsRef();
|
|
||||||
ByteArrayDataInput input = new ByteArrayDataInput();
|
|
||||||
while (reader.read(scratch)) {
|
|
||||||
suggestion.bytes = scratch.bytes;
|
|
||||||
suggestion.offset = scratch.offset;
|
|
||||||
suggestion.length = scratch.length - 5; // int + separator
|
|
||||||
|
|
||||||
input.reset(scratch.bytes);
|
|
||||||
input.skipBytes(suggestion.length + 1); // suggestion + separator
|
|
||||||
long cost = input.readInt();
|
|
||||||
|
|
||||||
if (previous == null) {
|
|
||||||
previous = new BytesRef();
|
|
||||||
} else if (suggestion.equals(previous)) {
|
|
||||||
continue; // for duplicate suggestions, the best weight is actually added
|
|
||||||
}
|
|
||||||
Util.toIntsRef(suggestion, scratchInts);
|
|
||||||
builder.add(scratchInts, cost);
|
|
||||||
previous.copyBytes(suggestion);
|
|
||||||
}
|
}
|
||||||
fst = builder.finish();
|
Util.toIntsRef(scratch, scratchInts);
|
||||||
success = true;
|
builder.add(scratchInts, cost);
|
||||||
} finally {
|
previous.copyBytes(scratch);
|
||||||
if (success) {
|
|
||||||
IOUtils.close(reader, writer);
|
|
||||||
} else {
|
|
||||||
IOUtils.closeWhileHandlingException(reader, writer);
|
|
||||||
}
|
|
||||||
|
|
||||||
tempInput.delete();
|
|
||||||
tempSorted.delete();
|
|
||||||
}
|
}
|
||||||
|
fst = builder.finish();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -270,16 +227,10 @@ public class WFSTCompletionLookup extends Lookup {
|
||||||
return output;
|
return output;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean add(CharSequence key, Object value) {
|
|
||||||
return false; // Not supported.
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the weight associated with an input string,
|
* Returns the weight associated with an input string,
|
||||||
* or null if it does not exist.
|
* or null if it does not exist.
|
||||||
*/
|
*/
|
||||||
@Override
|
|
||||||
public Object get(CharSequence key) {
|
public Object get(CharSequence key) {
|
||||||
Arc<Long> arc = new Arc<Long>();
|
Arc<Long> arc = new Arc<Long>();
|
||||||
Long result = null;
|
Long result = null;
|
||||||
|
@ -289,23 +240,51 @@ public class WFSTCompletionLookup extends Lookup {
|
||||||
if (result == null || !arc.isFinal()) {
|
if (result == null || !arc.isFinal()) {
|
||||||
return null;
|
return null;
|
||||||
} else {
|
} else {
|
||||||
return decodeWeight(result + arc.nextFinalOutput);
|
return Integer.valueOf(decodeWeight(result + arc.nextFinalOutput));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** cost -> weight */
|
/** cost -> weight */
|
||||||
private static float decodeWeight(long encoded) {
|
private static int decodeWeight(long encoded) {
|
||||||
return Integer.MAX_VALUE - encoded;
|
return (int)(Integer.MAX_VALUE - encoded);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** weight -> cost */
|
/** weight -> cost */
|
||||||
private static long encodeWeight(float value) {
|
private static int encodeWeight(long value) {
|
||||||
if (Float.isNaN(value) || Float.isInfinite(value) || value < 0 || value > Integer.MAX_VALUE) {
|
if (value < 0 || value > Integer.MAX_VALUE) {
|
||||||
throw new UnsupportedOperationException("cannot encode value: " + value);
|
throw new UnsupportedOperationException("cannot encode value: " + value);
|
||||||
}
|
}
|
||||||
return Integer.MAX_VALUE - (int)value;
|
return Integer.MAX_VALUE - (int)value;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private final class WFSTTermFreqIteratorWrapper extends SortedTermFreqIteratorWrapper {
|
||||||
|
|
||||||
|
WFSTTermFreqIteratorWrapper(TermFreqIterator source,
|
||||||
|
Comparator<BytesRef> comparator) throws IOException {
|
||||||
|
super(source, comparator, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void encode(ByteSequencesWriter writer, ByteArrayDataOutput output, byte[] buffer, BytesRef spare, long weight) throws IOException {
|
||||||
|
if (spare.length + 5 >= buffer.length) {
|
||||||
|
buffer = ArrayUtil.grow(buffer, spare.length + 5);
|
||||||
|
}
|
||||||
|
output.reset(buffer);
|
||||||
|
output.writeBytes(spare.bytes, spare.offset, spare.length);
|
||||||
|
output.writeByte((byte)0); // separator: not used, just for sort order
|
||||||
|
output.writeInt(encodeWeight(weight));
|
||||||
|
writer.write(buffer, 0, output.getPosition());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected long decode(BytesRef scratch, ByteArrayDataInput tmpInput) {
|
||||||
|
tmpInput.reset(scratch.bytes);
|
||||||
|
tmpInput.skipBytes(scratch.length - 4); // suggestion + separator
|
||||||
|
scratch.length -= 5; // sep + long
|
||||||
|
return tmpInput.readInt();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static final Comparator<Long> weightComparator = new Comparator<Long> () {
|
static final Comparator<Long> weightComparator = new Comparator<Long> () {
|
||||||
public int compare(Long left, Long right) {
|
public int compare(Long left, Long right) {
|
||||||
return left.compareTo(right);
|
return left.compareTo(right);
|
||||||
|
|
|
@ -55,24 +55,22 @@ public class JaspellLookup extends Lookup {
|
||||||
final CharsRef charsSpare = new CharsRef();
|
final CharsRef charsSpare = new CharsRef();
|
||||||
|
|
||||||
while ((spare = tfit.next()) != null) {
|
while ((spare = tfit.next()) != null) {
|
||||||
float freq = tfit.weight();
|
final long weight = tfit.weight();
|
||||||
if (spare.length == 0) {
|
if (spare.length == 0) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
charsSpare.grow(spare.length);
|
charsSpare.grow(spare.length);
|
||||||
UnicodeUtil.UTF8toUTF16(spare.bytes, spare.offset, spare.length, charsSpare);
|
UnicodeUtil.UTF8toUTF16(spare.bytes, spare.offset, spare.length, charsSpare);
|
||||||
trie.put(charsSpare.toString(), new Float(freq));
|
trie.put(charsSpare.toString(), Long.valueOf(weight));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean add(CharSequence key, Object value) {
|
public boolean add(CharSequence key, Object value) {
|
||||||
trie.put(key, value);
|
trie.put(key, value);
|
||||||
// XXX
|
// XXX
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public Object get(CharSequence key) {
|
public Object get(CharSequence key) {
|
||||||
return trie.get(key);
|
return trie.get(key);
|
||||||
}
|
}
|
||||||
|
@ -95,7 +93,7 @@ public class JaspellLookup extends Lookup {
|
||||||
if (onlyMorePopular) {
|
if (onlyMorePopular) {
|
||||||
LookupPriorityQueue queue = new LookupPriorityQueue(num);
|
LookupPriorityQueue queue = new LookupPriorityQueue(num);
|
||||||
for (String s : list) {
|
for (String s : list) {
|
||||||
float freq = (Float)trie.get(s);
|
long freq = ((Number)trie.get(s)).longValue();
|
||||||
queue.insertWithOverflow(new LookupResult(new CharsRef(s), freq));
|
queue.insertWithOverflow(new LookupResult(new CharsRef(s), freq));
|
||||||
}
|
}
|
||||||
for (LookupResult lr : queue.getResults()) {
|
for (LookupResult lr : queue.getResults()) {
|
||||||
|
@ -104,7 +102,7 @@ public class JaspellLookup extends Lookup {
|
||||||
} else {
|
} else {
|
||||||
for (int i = 0; i < maxCnt; i++) {
|
for (int i = 0; i < maxCnt; i++) {
|
||||||
String s = list.get(i);
|
String s = list.get(i);
|
||||||
float freq = (Float)trie.get(s);
|
long freq = ((Number)trie.get(s)).longValue();
|
||||||
res.add(new LookupResult(new CharsRef(s), freq));
|
res.add(new LookupResult(new CharsRef(s), freq));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -131,7 +129,7 @@ public class JaspellLookup extends Lookup {
|
||||||
node.splitchar = in.readChar();
|
node.splitchar = in.readChar();
|
||||||
byte mask = in.readByte();
|
byte mask = in.readByte();
|
||||||
if ((mask & HAS_VALUE) != 0) {
|
if ((mask & HAS_VALUE) != 0) {
|
||||||
node.data = new Float(in.readFloat());
|
node.data = Long.valueOf(in.readLong());
|
||||||
}
|
}
|
||||||
if ((mask & LO_KID) != 0) {
|
if ((mask & LO_KID) != 0) {
|
||||||
TSTNode kid = trie.new TSTNode('\0', node);
|
TSTNode kid = trie.new TSTNode('\0', node);
|
||||||
|
@ -171,7 +169,7 @@ public class JaspellLookup extends Lookup {
|
||||||
if (node.data != null) mask |= HAS_VALUE;
|
if (node.data != null) mask |= HAS_VALUE;
|
||||||
out.writeByte(mask);
|
out.writeByte(mask);
|
||||||
if (node.data != null) {
|
if (node.data != null) {
|
||||||
out.writeFloat((Float)node.data);
|
out.writeLong(((Number)node.data).longValue());
|
||||||
}
|
}
|
||||||
writeRecursively(out, node.relatives[TSTNode.LOKID]);
|
writeRecursively(out, node.relatives[TSTNode.LOKID]);
|
||||||
writeRecursively(out, node.relatives[TSTNode.EQKID]);
|
writeRecursively(out, node.relatives[TSTNode.EQKID]);
|
||||||
|
|
|
@ -50,26 +50,24 @@ public class TSTLookup extends Lookup {
|
||||||
}
|
}
|
||||||
|
|
||||||
ArrayList<String> tokens = new ArrayList<String>();
|
ArrayList<String> tokens = new ArrayList<String>();
|
||||||
ArrayList<Float> vals = new ArrayList<Float>();
|
ArrayList<Number> vals = new ArrayList<Number>();
|
||||||
BytesRef spare;
|
BytesRef spare;
|
||||||
CharsRef charsSpare = new CharsRef();
|
CharsRef charsSpare = new CharsRef();
|
||||||
while ((spare = tfit.next()) != null) {
|
while ((spare = tfit.next()) != null) {
|
||||||
charsSpare.grow(spare.length);
|
charsSpare.grow(spare.length);
|
||||||
UnicodeUtil.UTF8toUTF16(spare.bytes, spare.offset, spare.length, charsSpare);
|
UnicodeUtil.UTF8toUTF16(spare.bytes, spare.offset, spare.length, charsSpare);
|
||||||
tokens.add(charsSpare.toString());
|
tokens.add(charsSpare.toString());
|
||||||
vals.add(new Float(tfit.weight()));
|
vals.add(Long.valueOf(tfit.weight()));
|
||||||
}
|
}
|
||||||
autocomplete.balancedTree(tokens.toArray(), vals.toArray(), 0, tokens.size() - 1, root);
|
autocomplete.balancedTree(tokens.toArray(), vals.toArray(), 0, tokens.size() - 1, root);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean add(CharSequence key, Object value) {
|
public boolean add(CharSequence key, Object value) {
|
||||||
autocomplete.insert(root, key, value, 0);
|
autocomplete.insert(root, key, value, 0);
|
||||||
// XXX we don't know if a new node was created
|
// XXX we don't know if a new node was created
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public Object get(CharSequence key) {
|
public Object get(CharSequence key) {
|
||||||
List<TernaryTreeNode> list = autocomplete.prefixCompletion(root, key, 0);
|
List<TernaryTreeNode> list = autocomplete.prefixCompletion(root, key, 0);
|
||||||
if (list == null || list.isEmpty()) {
|
if (list == null || list.isEmpty()) {
|
||||||
|
@ -107,7 +105,7 @@ public class TSTLookup extends Lookup {
|
||||||
if (onlyMorePopular) {
|
if (onlyMorePopular) {
|
||||||
LookupPriorityQueue queue = new LookupPriorityQueue(num);
|
LookupPriorityQueue queue = new LookupPriorityQueue(num);
|
||||||
for (TernaryTreeNode ttn : list) {
|
for (TernaryTreeNode ttn : list) {
|
||||||
queue.insertWithOverflow(new LookupResult(ttn.token, (Float)ttn.val));
|
queue.insertWithOverflow(new LookupResult(ttn.token, ((Number)ttn.val).longValue()));
|
||||||
}
|
}
|
||||||
for (LookupResult lr : queue.getResults()) {
|
for (LookupResult lr : queue.getResults()) {
|
||||||
res.add(lr);
|
res.add(lr);
|
||||||
|
@ -115,7 +113,7 @@ public class TSTLookup extends Lookup {
|
||||||
} else {
|
} else {
|
||||||
for (int i = 0; i < maxCnt; i++) {
|
for (int i = 0; i < maxCnt; i++) {
|
||||||
TernaryTreeNode ttn = list.get(i);
|
TernaryTreeNode ttn = list.get(i);
|
||||||
res.add(new LookupResult(ttn.token, (Float)ttn.val));
|
res.add(new LookupResult(ttn.token, ((Number)ttn.val).longValue()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return res;
|
return res;
|
||||||
|
@ -146,7 +144,7 @@ public class TSTLookup extends Lookup {
|
||||||
node.token = in.readUTF();
|
node.token = in.readUTF();
|
||||||
}
|
}
|
||||||
if ((mask & HAS_VALUE) != 0) {
|
if ((mask & HAS_VALUE) != 0) {
|
||||||
node.val = new Float(in.readFloat());
|
node.val = Long.valueOf(in.readLong());
|
||||||
}
|
}
|
||||||
if ((mask & LO_KID) != 0) {
|
if ((mask & LO_KID) != 0) {
|
||||||
node.loKid = new TernaryTreeNode();
|
node.loKid = new TernaryTreeNode();
|
||||||
|
@ -184,7 +182,7 @@ public class TSTLookup extends Lookup {
|
||||||
if (node.val != null) mask |= HAS_VALUE;
|
if (node.val != null) mask |= HAS_VALUE;
|
||||||
out.writeByte(mask);
|
out.writeByte(mask);
|
||||||
if (node.token != null) out.writeUTF(node.token);
|
if (node.token != null) out.writeUTF(node.token);
|
||||||
if (node.val != null) out.writeFloat((Float)node.val);
|
if (node.val != null) out.writeLong(((Number)node.val).longValue());
|
||||||
// recurse and write kids
|
// recurse and write kids
|
||||||
if (node.loKid != null) {
|
if (node.loKid != null) {
|
||||||
writeRecursively(out, node.loKid);
|
writeRecursively(out, node.loKid);
|
||||||
|
|
|
@ -17,8 +17,10 @@
|
||||||
package org.apache.lucene.search.suggest;
|
package org.apache.lucene.search.suggest;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.lucene.search.suggest.Lookup;
|
import org.apache.lucene.search.suggest.Lookup;
|
||||||
|
import org.apache.lucene.search.suggest.Lookup.LookupResult;
|
||||||
import org.apache.lucene.search.suggest.fst.FSTCompletionLookup;
|
import org.apache.lucene.search.suggest.fst.FSTCompletionLookup;
|
||||||
import org.apache.lucene.search.suggest.jaspell.JaspellLookup;
|
import org.apache.lucene.search.suggest.jaspell.JaspellLookup;
|
||||||
import org.apache.lucene.search.suggest.tst.TSTLookup;
|
import org.apache.lucene.search.suggest.tst.TSTLookup;
|
||||||
|
@ -74,16 +76,18 @@ public class PersistenceTest extends LuceneTestCase {
|
||||||
lookup.load(storeDir);
|
lookup.load(storeDir);
|
||||||
|
|
||||||
// Assert validity.
|
// Assert validity.
|
||||||
float previous = Float.NEGATIVE_INFINITY;
|
long previous = Long.MIN_VALUE;
|
||||||
for (TermFreq k : keys) {
|
for (TermFreq k : keys) {
|
||||||
Float val = (Float) lookup.get(_TestUtil.bytesToCharSequence(k.term, random));
|
List<LookupResult> list = lookup.lookup(_TestUtil.bytesToCharSequence(k.term, random), false, 1);
|
||||||
assertNotNull(k.term.utf8ToString(), val);
|
assertEquals(1, list.size());
|
||||||
|
LookupResult lookupResult = list.get(0);
|
||||||
|
assertNotNull(k.term.utf8ToString(), lookupResult.key);
|
||||||
|
|
||||||
if (supportsExactWeights) {
|
if (supportsExactWeights) {
|
||||||
assertEquals(k.term.utf8ToString(), Float.valueOf(k.v), val);
|
assertEquals(k.term.utf8ToString(), k.v, lookupResult.value);
|
||||||
} else {
|
} else {
|
||||||
assertTrue(val + ">=" + previous, val >= previous);
|
assertTrue(lookupResult.value + ">=" + previous, lookupResult.value >= previous);
|
||||||
previous = val.floatValue();
|
previous = lookupResult.value;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,59 +29,79 @@ import org.apache.lucene.util.LuceneTestCase;
|
||||||
import org.apache.lucene.util._TestUtil;
|
import org.apache.lucene.util._TestUtil;
|
||||||
|
|
||||||
public class TestBytesRefList extends LuceneTestCase {
|
public class TestBytesRefList extends LuceneTestCase {
|
||||||
|
|
||||||
public void testAppend() throws IOException {
|
public void testAppend() throws IOException {
|
||||||
BytesRefList list = new BytesRefList();
|
BytesRefList list = new BytesRefList();
|
||||||
List<String> stringList = new ArrayList<String>();
|
List<String> stringList = new ArrayList<String>();
|
||||||
int entries = atLeast(500);
|
for (int j = 0; j < 2; j++) {
|
||||||
BytesRef spare = new BytesRef();
|
if (j > 0 && random.nextBoolean()) {
|
||||||
for (int i = 0; i < entries; i++) {
|
list.clear();
|
||||||
String randomRealisticUnicodeString = _TestUtil
|
stringList.clear();
|
||||||
.randomRealisticUnicodeString(random);
|
}
|
||||||
spare.copyChars(randomRealisticUnicodeString);
|
int entries = atLeast(500);
|
||||||
list.append(spare);
|
BytesRef spare = new BytesRef();
|
||||||
stringList.add(randomRealisticUnicodeString);
|
for (int i = 0; i < entries; i++) {
|
||||||
}
|
String randomRealisticUnicodeString = _TestUtil
|
||||||
for (int i = 0; i < entries; i++) {
|
.randomRealisticUnicodeString(random);
|
||||||
assertNotNull(list.get(spare, i));
|
spare.copyChars(randomRealisticUnicodeString);
|
||||||
assertEquals("entry " + i + " doesn't match", stringList.get(i),
|
list.append(spare);
|
||||||
spare.utf8ToString());
|
stringList.add(randomRealisticUnicodeString);
|
||||||
}
|
}
|
||||||
|
for (int i = 0; i < entries; i++) {
|
||||||
// check random
|
assertNotNull(list.get(spare, i));
|
||||||
for (int i = 0; i < entries; i++) {
|
assertEquals("entry " + i + " doesn't match", stringList.get(i),
|
||||||
int e = random.nextInt(entries);
|
spare.utf8ToString());
|
||||||
assertNotNull(list.get(spare, e));
|
}
|
||||||
assertEquals("entry " + i + " doesn't match", stringList.get(e),
|
|
||||||
spare.utf8ToString());
|
// check random
|
||||||
}
|
for (int i = 0; i < entries; i++) {
|
||||||
for (int i = 0; i < 2; i++) {
|
int e = random.nextInt(entries);
|
||||||
|
assertNotNull(list.get(spare, e));
|
||||||
BytesRefIterator iterator = list.iterator();
|
assertEquals("entry " + i + " doesn't match", stringList.get(e),
|
||||||
for (String string : stringList) {
|
spare.utf8ToString());
|
||||||
assertEquals(string, iterator.next().utf8ToString());
|
}
|
||||||
|
for (int i = 0; i < 2; i++) {
|
||||||
|
|
||||||
|
BytesRefIterator iterator = list.iterator();
|
||||||
|
for (String string : stringList) {
|
||||||
|
assertEquals(string, iterator.next().utf8ToString());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testSort() {
|
public void testSort() throws IOException {
|
||||||
BytesRefList list = new BytesRefList();
|
BytesRefList list = new BytesRefList();
|
||||||
List<String> stringList = new ArrayList<String>();
|
List<String> stringList = new ArrayList<String>();
|
||||||
int entries = atLeast(500);
|
|
||||||
BytesRef spare = new BytesRef();
|
for (int j = 0; j < 2; j++) {
|
||||||
for (int i = 0; i < entries; i++) {
|
if (j > 0 && random.nextBoolean()) {
|
||||||
String randomRealisticUnicodeString = _TestUtil.randomRealisticUnicodeString(random);
|
list.clear();
|
||||||
spare.copyChars(randomRealisticUnicodeString);
|
stringList.clear();
|
||||||
list.append(spare);
|
}
|
||||||
stringList.add(randomRealisticUnicodeString);
|
int entries = atLeast(500);
|
||||||
}
|
BytesRef spare = new BytesRef();
|
||||||
Collections.sort(stringList);
|
for (int i = 0; i < entries; i++) {
|
||||||
int[] sortedOrds = list.sort(BytesRef.getUTF8SortedAsUTF16Comparator());
|
String randomRealisticUnicodeString = _TestUtil
|
||||||
for (int i = 0; i < entries; i++) {
|
.randomRealisticUnicodeString(random);
|
||||||
assertNotNull(list.get(spare, sortedOrds[i]));
|
spare.copyChars(randomRealisticUnicodeString);
|
||||||
assertEquals("entry " + i + " doesn't match", stringList.get(i),
|
list.append(spare);
|
||||||
spare.utf8ToString());
|
stringList.add(randomRealisticUnicodeString);
|
||||||
|
}
|
||||||
|
|
||||||
|
Collections.sort(stringList);
|
||||||
|
BytesRefIterator iter = list.iterator(BytesRef
|
||||||
|
.getUTF8SortedAsUTF16Comparator());
|
||||||
|
int i = 0;
|
||||||
|
while ((spare = iter.next()) != null) {
|
||||||
|
assertEquals("entry " + i + " doesn't match", stringList.get(i),
|
||||||
|
spare.utf8ToString());
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
assertNull(iter.next());
|
||||||
|
assertEquals(i, stringList.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,12 +17,16 @@ package org.apache.lucene.search.suggest;
|
||||||
* the License.
|
* the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import java.util.Comparator;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.TreeMap;
|
import java.util.TreeMap;
|
||||||
|
|
||||||
import org.apache.lucene.search.spell.TermFreqIterator;
|
import org.apache.lucene.search.spell.TermFreqIterator;
|
||||||
|
import org.apache.lucene.store.ByteArrayDataOutput;
|
||||||
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.BytesRefHash;
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
import org.apache.lucene.util._TestUtil;
|
import org.apache.lucene.util._TestUtil;
|
||||||
|
|
||||||
|
@ -38,7 +42,8 @@ public class TestTermFreqIterator extends LuceneTestCase {
|
||||||
public void testTerms() throws Exception {
|
public void testTerms() throws Exception {
|
||||||
int num = atLeast(10000);
|
int num = atLeast(10000);
|
||||||
|
|
||||||
TreeMap<BytesRef,Long> sorted = new TreeMap<BytesRef,Long>();
|
Comparator<BytesRef> comparator = random.nextBoolean() ? BytesRef.getUTF8SortedAsUnicodeComparator() : BytesRef.getUTF8SortedAsUTF16Comparator();
|
||||||
|
TreeMap<BytesRef,Long> sorted = new TreeMap<BytesRef,Long>(comparator);
|
||||||
TermFreq[] unsorted = new TermFreq[num];
|
TermFreq[] unsorted = new TermFreq[num];
|
||||||
|
|
||||||
for (int i = 0; i < num; i++) {
|
for (int i = 0; i < num; i++) {
|
||||||
|
@ -52,13 +57,13 @@ public class TestTermFreqIterator extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
// test the sorted iterator wrapper
|
// test the sorted iterator wrapper
|
||||||
TermFreqIterator wrapper = new SortedTermFreqIteratorWrapper(new TermFreqArrayIterator(unsorted), BytesRef.getUTF8SortedAsUnicodeComparator());
|
TermFreqIterator wrapper = new SortedTermFreqIteratorWrapper(new TermFreqArrayIterator(unsorted), comparator);
|
||||||
Iterator<Map.Entry<BytesRef,Long>> expected = sorted.entrySet().iterator();
|
Iterator<Map.Entry<BytesRef,Long>> expected = sorted.entrySet().iterator();
|
||||||
while (expected.hasNext()) {
|
while (expected.hasNext()) {
|
||||||
Map.Entry<BytesRef,Long> entry = expected.next();
|
Map.Entry<BytesRef,Long> entry = expected.next();
|
||||||
|
|
||||||
assertEquals(entry.getKey(), wrapper.next());
|
assertEquals(entry.getKey(), wrapper.next());
|
||||||
assertEquals(entry.getValue().longValue(), wrapper.weight(), 0F);
|
assertEquals(entry.getValue().longValue(), wrapper.weight());
|
||||||
}
|
}
|
||||||
assertNull(wrapper.next());
|
assertNull(wrapper.next());
|
||||||
|
|
||||||
|
@ -72,4 +77,57 @@ public class TestTermFreqIterator extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
assertEquals(sorted, actual);
|
assertEquals(sorted, actual);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void testRaw() throws Exception {
|
||||||
|
int num = atLeast(10000);
|
||||||
|
|
||||||
|
Comparator<BytesRef> comparator = BytesRef.getUTF8SortedAsUnicodeComparator();
|
||||||
|
BytesRefHash sorted = new BytesRefHash();
|
||||||
|
TermFreq[] unsorted = new TermFreq[num];
|
||||||
|
byte[] buffer = new byte[0];
|
||||||
|
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
|
||||||
|
|
||||||
|
for (int i = 0; i < num; i++) {
|
||||||
|
BytesRef spare;
|
||||||
|
long weight;
|
||||||
|
do {
|
||||||
|
spare = new BytesRef(_TestUtil.randomUnicodeString(random));
|
||||||
|
if (spare.length + 8 >= buffer.length) {
|
||||||
|
buffer = ArrayUtil.grow(buffer, spare.length + 8);
|
||||||
|
}
|
||||||
|
output.reset(buffer);
|
||||||
|
output.writeBytes(spare.bytes, spare.offset, spare.length);
|
||||||
|
weight = random.nextLong();
|
||||||
|
output.writeLong(weight);
|
||||||
|
|
||||||
|
} while (sorted.add(new BytesRef(buffer, 0, output.getPosition())) < 0);
|
||||||
|
unsorted[i] = new TermFreq(spare, weight);
|
||||||
|
}
|
||||||
|
|
||||||
|
// test the sorted iterator wrapper
|
||||||
|
TermFreqIterator wrapper = new SortedTermFreqIteratorWrapper(new TermFreqArrayIterator(unsorted), comparator, true);
|
||||||
|
int[] sort = sorted.sort(comparator);
|
||||||
|
int size = sorted.size();
|
||||||
|
BytesRef spare = new BytesRef();
|
||||||
|
for (int i = 0; i < size; i++) {
|
||||||
|
sorted.get(sort[i], spare);
|
||||||
|
spare.length -= 8; // sub the long value
|
||||||
|
assertEquals(spare, wrapper.next());
|
||||||
|
spare.offset = spare.offset + spare.length;
|
||||||
|
spare.length = 8;
|
||||||
|
assertEquals(asLong(spare), wrapper.weight());
|
||||||
|
}
|
||||||
|
assertNull(wrapper.next());
|
||||||
|
}
|
||||||
|
|
||||||
|
public static long asLong(BytesRef b) {
|
||||||
|
return (((long) asIntInternal(b, b.offset) << 32) | asIntInternal(b,
|
||||||
|
b.offset + 4) & 0xFFFFFFFFL);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static int asIntInternal(BytesRef b, int pos) {
|
||||||
|
return ((b.bytes[pos++] & 0xFF) << 24) | ((b.bytes[pos++] & 0xFF) << 16)
|
||||||
|
| ((b.bytes[pos++] & 0xFF) << 8) | (b.bytes[pos] & 0xFF);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,9 +17,8 @@ package org.apache.lucene.search.suggest.fst;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.util.Iterator;
|
|
||||||
|
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.BytesRefIterator;
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
|
@ -31,7 +30,7 @@ public class BytesRefSortersTest extends LuceneTestCase {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testInMemorySorter() throws Exception {
|
public void testInMemorySorter() throws Exception {
|
||||||
check(new InMemorySorter());
|
check(new InMemorySorter(BytesRef.getUTF8SortedAsUnicodeComparator()));
|
||||||
}
|
}
|
||||||
|
|
||||||
private void check(BytesRefSorter sorter) throws Exception {
|
private void check(BytesRefSorter sorter) throws Exception {
|
||||||
|
@ -42,8 +41,8 @@ public class BytesRefSortersTest extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create two iterators and check that they're aligned with each other.
|
// Create two iterators and check that they're aligned with each other.
|
||||||
Iterator<BytesRef> i1 = sorter.iterator();
|
BytesRefIterator i1 = sorter.iterator();
|
||||||
Iterator<BytesRef> i2 = sorter.iterator();
|
BytesRefIterator i2 = sorter.iterator();
|
||||||
|
|
||||||
// Verify sorter contract.
|
// Verify sorter contract.
|
||||||
try {
|
try {
|
||||||
|
@ -52,10 +51,12 @@ public class BytesRefSortersTest extends LuceneTestCase {
|
||||||
} catch (IllegalStateException e) {
|
} catch (IllegalStateException e) {
|
||||||
// Expected.
|
// Expected.
|
||||||
}
|
}
|
||||||
|
BytesRef spare1;
|
||||||
while (i1.hasNext() && i2.hasNext()) {
|
BytesRef spare2;
|
||||||
assertEquals(i1.next(), i2.next());
|
while ((spare1 = i1.next()) != null && (spare2 = i2.next()) != null) {
|
||||||
|
assertEquals(spare1, spare2);
|
||||||
}
|
}
|
||||||
assertEquals(i1.hasNext(), i2.hasNext());
|
assertNull(i1.next());
|
||||||
|
assertNull(i2.next());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -165,9 +165,9 @@ public class FSTCompletionTest extends LuceneTestCase {
|
||||||
|
|
||||||
// All the weights were constant, so all returned buckets must be constant, whatever they
|
// All the weights were constant, so all returned buckets must be constant, whatever they
|
||||||
// are.
|
// are.
|
||||||
Float previous = null;
|
Long previous = null;
|
||||||
for (TermFreq tf : keys) {
|
for (TermFreq tf : keys) {
|
||||||
Float current = (Float)lookup.get(_TestUtil.bytesToCharSequence(tf.term, random));
|
Long current = ((Number)lookup.get(_TestUtil.bytesToCharSequence(tf.term, random))).longValue();
|
||||||
if (previous != null) {
|
if (previous != null) {
|
||||||
assertEquals(previous, current);
|
assertEquals(previous, current);
|
||||||
}
|
}
|
||||||
|
@ -181,7 +181,7 @@ public class FSTCompletionTest extends LuceneTestCase {
|
||||||
FSTCompletionLookup lookup = new FSTCompletionLookup();
|
FSTCompletionLookup lookup = new FSTCompletionLookup();
|
||||||
lookup.build(new TermFreqArrayIterator(input));
|
lookup.build(new TermFreqArrayIterator(input));
|
||||||
for (TermFreq tf : input) {
|
for (TermFreq tf : input) {
|
||||||
assertTrue("Not found: " + tf.term.toString(), lookup.get(_TestUtil.bytesToCharSequence(tf.term, random)) != null);
|
assertNotNull("Not found: " + tf.term.toString(), lookup.get(_TestUtil.bytesToCharSequence(tf.term, random)));
|
||||||
assertEquals(tf.term.utf8ToString(), lookup.lookup(_TestUtil.bytesToCharSequence(tf.term, random), true, 1).get(0).key.toString());
|
assertEquals(tf.term.utf8ToString(), lookup.lookup(_TestUtil.bytesToCharSequence(tf.term, random), true, 1).get(0).key.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,140 +0,0 @@
|
||||||
package org.apache.lucene.search.suggest.fst;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
import java.util.*;
|
|
||||||
|
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
|
||||||
import org.apache.lucene.util.NumericUtils;
|
|
||||||
import org.junit.Ignore;
|
|
||||||
import org.junit.Test;
|
|
||||||
|
|
||||||
public class FloatMagicTest extends LuceneTestCase {
|
|
||||||
public void testFloatMagic() {
|
|
||||||
ArrayList<Float> floats = new ArrayList<Float>(Arrays.asList(
|
|
||||||
Float.intBitsToFloat(0x7f800001), // NaN (invalid combination).
|
|
||||||
Float.intBitsToFloat(0x7fffffff), // NaN (invalid combination).
|
|
||||||
Float.intBitsToFloat(0xff800001), // NaN (invalid combination).
|
|
||||||
Float.intBitsToFloat(0xffffffff), // NaN (invalid combination).
|
|
||||||
Float.POSITIVE_INFINITY,
|
|
||||||
Float.MAX_VALUE,
|
|
||||||
100f,
|
|
||||||
0f,
|
|
||||||
0.1f,
|
|
||||||
Float.MIN_VALUE,
|
|
||||||
Float.NaN,
|
|
||||||
-0.0f,
|
|
||||||
-Float.MIN_VALUE,
|
|
||||||
-0.1f,
|
|
||||||
-1f,
|
|
||||||
-10f,
|
|
||||||
Float.NEGATIVE_INFINITY));
|
|
||||||
|
|
||||||
// Sort them using juc.
|
|
||||||
Collections.sort(floats);
|
|
||||||
|
|
||||||
// Convert to sortable int4 representation (as long to have an unsigned sort).
|
|
||||||
long [] int4 = new long [floats.size()];
|
|
||||||
for (int i = 0; i < floats.size(); i++) {
|
|
||||||
int4[i] = FloatMagic.toSortable(floats.get(i)) & 0xffffffffL;
|
|
||||||
|
|
||||||
/*
|
|
||||||
System.out.println(
|
|
||||||
String.format("raw %8s sortable %8s %8s numutils %8s %s",
|
|
||||||
Integer.toHexString(Float.floatToRawIntBits(floats.get(i))),
|
|
||||||
Integer.toHexString(FloatMagic.toSortable(floats.get(i))),
|
|
||||||
Integer.toHexString(FloatMagic.unsignedOrderedToFloatBits(FloatMagic.toSortable(floats.get(i)))),
|
|
||||||
Integer.toHexString(NumericUtils.floatToSortableInt(floats.get(i))),
|
|
||||||
floats.get(i)));
|
|
||||||
*/
|
|
||||||
}
|
|
||||||
|
|
||||||
// Sort and compare. Should be identical order.
|
|
||||||
Arrays.sort(int4);
|
|
||||||
ArrayList<Float> backFromFixed = new ArrayList<Float>();
|
|
||||||
for (int i = 0; i < int4.length; i++) {
|
|
||||||
backFromFixed.add(FloatMagic.fromSortable((int) int4[i]));
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
for (int i = 0; i < int4.length; i++) {
|
|
||||||
System.out.println(
|
|
||||||
floats.get(i) + " " + FloatMagic.fromSortable((int) int4[i]));
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
assertEquals(floats, backFromFixed);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Ignore("Once checked, valid forever?") @Test
|
|
||||||
public void testRoundTripFullRange() {
|
|
||||||
int i = 0;
|
|
||||||
do {
|
|
||||||
float f = Float.intBitsToFloat(i);
|
|
||||||
float f2 = FloatMagic.fromSortable(FloatMagic.toSortable(f));
|
|
||||||
|
|
||||||
if (!((Float.isNaN(f) && Float.isNaN(f2)) || f == f2)) {
|
|
||||||
throw new RuntimeException("! " + Integer.toHexString(i) + "> " + f + " " + f2);
|
|
||||||
}
|
|
||||||
|
|
||||||
if ((i & 0xffffff) == 0) {
|
|
||||||
System.out.println(Integer.toHexString(i));
|
|
||||||
}
|
|
||||||
|
|
||||||
i++;
|
|
||||||
} while (i != 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Ignore("Once checked, valid forever?") @Test
|
|
||||||
public void testIncreasingFullRange() {
|
|
||||||
// -infinity ... -0.0
|
|
||||||
for (int i = 0xff800000; i != 0x80000000; i--) {
|
|
||||||
checkSmaller(i, i - 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
// -0.0 +0.0
|
|
||||||
checkSmaller(0x80000000, 0);
|
|
||||||
|
|
||||||
// +0.0 ... +infinity
|
|
||||||
for (int i = 0; i != 0x7f800000; i++) {
|
|
||||||
checkSmaller(i, i + 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
// All other are NaNs and should be after positive infinity.
|
|
||||||
final long infinity = toSortableL(Float.POSITIVE_INFINITY);
|
|
||||||
for (int i = 0x7f800001; i != 0x7fffffff; i++) {
|
|
||||||
assertTrue(infinity < toSortableL(Float.intBitsToFloat(i)));
|
|
||||||
}
|
|
||||||
for (int i = 0xff800001; i != 0xffffffff; i++) {
|
|
||||||
assertTrue(infinity < toSortableL(Float.intBitsToFloat(i)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private long toSortableL(float f) {
|
|
||||||
return FloatMagic.toSortable(f) & 0xffffffffL;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void checkSmaller(int i1, int i2) {
|
|
||||||
float f1 = Float.intBitsToFloat(i1);
|
|
||||||
float f2 = Float.intBitsToFloat(i2);
|
|
||||||
if (f1 > f2) {
|
|
||||||
throw new AssertionError(f1 + " " + f2 + " " + i1 + " " + i2);
|
|
||||||
}
|
|
||||||
assertTrue(toSortableL(f1) < toSortableL(f2));
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.search.suggest.fst;
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
import java.util.Comparator;
|
||||||
|
|
||||||
import org.apache.lucene.search.suggest.fst.Sort.BufferSize;
|
import org.apache.lucene.search.suggest.fst.Sort.BufferSize;
|
||||||
import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesWriter;
|
import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesWriter;
|
||||||
|
@ -61,7 +62,7 @@ public class TestSort extends LuceneTestCase {
|
||||||
@Test
|
@Test
|
||||||
public void testIntermediateMerges() throws Exception {
|
public void testIntermediateMerges() throws Exception {
|
||||||
// Sort 20 mb worth of data with 1mb buffer, binary merging.
|
// Sort 20 mb worth of data with 1mb buffer, binary merging.
|
||||||
SortInfo info = checkSort(new Sort(BufferSize.megabytes(1), Sort.defaultTempDir(), 2),
|
SortInfo info = checkSort(new Sort(Sort.DEFAULT_COMPARATOR, BufferSize.megabytes(1), Sort.defaultTempDir(), 2),
|
||||||
generateRandom(Sort.MB * 20));
|
generateRandom(Sort.MB * 20));
|
||||||
assertTrue(info.mergeRounds > 10);
|
assertTrue(info.mergeRounds > 10);
|
||||||
}
|
}
|
||||||
|
@ -69,7 +70,7 @@ public class TestSort extends LuceneTestCase {
|
||||||
@Test
|
@Test
|
||||||
public void testSmallRandom() throws Exception {
|
public void testSmallRandom() throws Exception {
|
||||||
// Sort 20 mb worth of data with 1mb buffer.
|
// Sort 20 mb worth of data with 1mb buffer.
|
||||||
SortInfo sortInfo = checkSort(new Sort(BufferSize.megabytes(1), Sort.defaultTempDir(), Sort.MAX_TEMPFILES),
|
SortInfo sortInfo = checkSort(new Sort(Sort.DEFAULT_COMPARATOR, BufferSize.megabytes(1), Sort.defaultTempDir(), Sort.MAX_TEMPFILES),
|
||||||
generateRandom(Sort.MB * 20));
|
generateRandom(Sort.MB * 20));
|
||||||
assertEquals(1, sortInfo.mergeRounds);
|
assertEquals(1, sortInfo.mergeRounds);
|
||||||
}
|
}
|
||||||
|
@ -77,7 +78,7 @@ public class TestSort extends LuceneTestCase {
|
||||||
@Test @Nightly
|
@Test @Nightly
|
||||||
public void testLargerRandom() throws Exception {
|
public void testLargerRandom() throws Exception {
|
||||||
// Sort 100MB worth of data with 16mb buffer.
|
// Sort 100MB worth of data with 16mb buffer.
|
||||||
checkSort(new Sort(BufferSize.megabytes(16), Sort.defaultTempDir(), Sort.MAX_TEMPFILES),
|
checkSort(new Sort(Sort.DEFAULT_COMPARATOR, BufferSize.megabytes(16), Sort.defaultTempDir(), Sort.MAX_TEMPFILES),
|
||||||
generateRandom(Sort.MB * 100));
|
generateRandom(Sort.MB * 100));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -92,14 +93,25 @@ public class TestSort extends LuceneTestCase {
|
||||||
byte [][] bytes = data.toArray(new byte[data.size()][]);
|
byte [][] bytes = data.toArray(new byte[data.size()][]);
|
||||||
return bytes;
|
return bytes;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static final Comparator<byte[]> unsignedByteOrderComparator = new Comparator<byte[]>() {
|
||||||
|
public int compare(byte[] left, byte[] right) {
|
||||||
|
final int max = Math.min(left.length, right.length);
|
||||||
|
for (int i = 0, j = 0; i < max; i++, j++) {
|
||||||
|
int diff = (left[i] & 0xff) - (right[j] & 0xff);
|
||||||
|
if (diff != 0)
|
||||||
|
return diff;
|
||||||
|
}
|
||||||
|
return left.length - right.length;
|
||||||
|
}
|
||||||
|
};
|
||||||
/**
|
/**
|
||||||
* Check sorting data on an instance of {@link Sort}.
|
* Check sorting data on an instance of {@link Sort}.
|
||||||
*/
|
*/
|
||||||
private SortInfo checkSort(Sort sort, byte[][] data) throws IOException {
|
private SortInfo checkSort(Sort sort, byte[][] data) throws IOException {
|
||||||
File unsorted = writeAll("unsorted", data);
|
File unsorted = writeAll("unsorted", data);
|
||||||
|
|
||||||
Arrays.sort(data, Sort.unsignedByteOrderComparator);
|
Arrays.sort(data, unsignedByteOrderComparator);
|
||||||
File golden = writeAll("golden", data);
|
File golden = writeAll("golden", data);
|
||||||
|
|
||||||
File sorted = new File(tempDir, "sorted");
|
File sorted = new File(tempDir, "sorted");
|
||||||
|
|
|
@ -117,7 +117,7 @@ public class WFSTCompletionTest extends LuceneTestCase {
|
||||||
// TODO: could be faster... but its slowCompletor for a reason
|
// TODO: could be faster... but its slowCompletor for a reason
|
||||||
for (Map.Entry<String,Long> e : slowCompletor.entrySet()) {
|
for (Map.Entry<String,Long> e : slowCompletor.entrySet()) {
|
||||||
if (e.getKey().startsWith(prefix)) {
|
if (e.getKey().startsWith(prefix)) {
|
||||||
matches.add(new LookupResult(e.getKey(), (float)e.getValue().longValue()));
|
matches.add(new LookupResult(e.getKey(), e.getValue().longValue()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -153,11 +153,6 @@ public class Suggester extends SolrSpellChecker {
|
||||||
build(core, searcher);
|
build(core, searcher);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void add(CharsRef query, int numHits) {
|
|
||||||
LOG.info("add " + query + ", " + numHits);
|
|
||||||
lookup.add(query, new Integer(numHits));
|
|
||||||
}
|
|
||||||
|
|
||||||
static SpellingResult EMPTY_RESULT = new SpellingResult();
|
static SpellingResult EMPTY_RESULT = new SpellingResult();
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -182,7 +177,7 @@ public class Suggester extends SolrSpellChecker {
|
||||||
Collections.sort(suggestions);
|
Collections.sort(suggestions);
|
||||||
}
|
}
|
||||||
for (LookupResult lr : suggestions) {
|
for (LookupResult lr : suggestions) {
|
||||||
res.add(t, lr.key.toString(), ((Number)lr.value).intValue());
|
res.add(t, lr.key.toString(), (int)lr.value);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return res;
|
return res;
|
||||||
|
|
Loading…
Reference in New Issue