LUCENE-3807: Cleanup Suggest / Lookup API

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1296268 13f79535-47bb-0310-9956-ffa450edef68
Simon Willnauer 2012-03-02 15:59:55 +00:00
parent 2c94c522fd
commit f303bcd465
27 changed files with 692 additions and 627 deletions


@@ -115,6 +115,13 @@ Changes in backwards compatibility policy
 * LUCENE-3626: The internal implementation classes in PKIndexSplitter
   and MultiPassIndexSplitter were made private as they now work
   per segment. (Uwe Schindler)
+
+* LUCENE-3807: Cleaned up Suggest / Lookup API. Term weights (freqs) are now
+  64bit signed integers instead of 32bit floats. Sorting of terms is now a
+  disk based merge sort instead of an in-memory sort. The Lookup API now
+  accepts and returns CharSequence instead of String which should be converted
+  into a String before used in a datastructure that relies on hashCode / equals.
+  (Simon Willnauer)
 
 Changes in Runtime Behavior
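For illustration only (not part of this patch): a minimal caller-side sketch of what the entry above means for client code. It assumes an already-built Lookup instance (any of the implementations touched below) and relies only on the lookup()/LookupResult signatures shown in this diff: keys come back as CharSequence and should be converted to String before being used as hash keys, and weights are plain 64bit longs.

    import java.util.HashMap;
    import java.util.Map;

    import org.apache.lucene.search.suggest.Lookup;
    import org.apache.lucene.search.suggest.Lookup.LookupResult;

    public class SuggestMigrationSketch {
      /** Collects the top suggestions for a prefix into a String-keyed map. */
      public static Map<String, Long> topSuggestions(Lookup lookup, CharSequence prefix, int num) {
        Map<String, Long> weights = new HashMap<String, Long>();
        for (LookupResult result : lookup.lookup(prefix, false, num)) {
          // result.key is a CharSequence: convert before relying on hashCode/equals.
          weights.put(result.key.toString(), result.value);
        }
        return weights;
      }
    }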


@@ -25,6 +25,7 @@ import org.apache.lucene.util.BytesRef;
 
 /**
  * This wrapper buffers incoming elements.
+ * @lucene.experimental
  */
 public class BufferingTermFreqIteratorWrapper implements TermFreqIterator {
   // TODO keep this for now


@@ -18,81 +18,113 @@ package org.apache.lucene.search.suggest;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator; import java.util.Comparator;
import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.ByteBlockPool; import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator; import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.SorterTemplate; import org.apache.lucene.util.SorterTemplate;
final class BytesRefList { /**
* A simple append only random-access {@link BytesRef} array that stores full
* copies of the appended bytes in a {@link ByteBlockPool}.
*
*
* <b>Note: This class is not Thread-Safe!</b>
*
* @lucene.internal
* @lucene.experimental
*/
public final class BytesRefList {
// TODO rename to BytesRefArray
private final ByteBlockPool pool; private final ByteBlockPool pool;
private int[] offsets = new int[1]; private int[] offsets = new int[1];
private int currentElement = 0; private int lastElement = 0;
private int currentOffset = 0; private int currentOffset = 0;
private final Counter bytesUsed = Counter.newCounter(false);
/**
* Creates a new {@link BytesRefList}
*/
public BytesRefList() { public BytesRefList() {
this(new ByteBlockPool(new ByteBlockPool.DirectAllocator())); this.pool = new ByteBlockPool(new ByteBlockPool.DirectTrackingAllocator(
} bytesUsed));
public BytesRefList(ByteBlockPool pool) {
this.pool = pool;
pool.nextBuffer(); pool.nextBuffer();
bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_ARRAY_HEADER
+ RamUsageEstimator.NUM_BYTES_INT);
} }
public int append(BytesRef bytes) { /**
if (currentElement >= offsets.length) { * Clears this {@link BytesRefList}
offsets = ArrayUtil.grow(offsets, offsets.length + 1); */
} public void clear() {
pool.copy(bytes); lastElement = 0;
offsets[currentElement++] = currentOffset; currentOffset = 0;
currentOffset += bytes.length; Arrays.fill(offsets, 0);
return currentElement; pool.reset();
}
public int size() {
return currentElement;
}
public BytesRef get(BytesRef bytes, int pos) {
if (currentElement > pos) {
bytes.offset = offsets[pos];
bytes.length = pos == currentElement - 1 ? currentOffset - bytes.offset
: offsets[pos + 1] - bytes.offset;
pool.copyFrom(bytes);
return bytes;
}
throw new IndexOutOfBoundsException("index " + pos
+ " must be less than the size: " + currentElement);
}
public BytesRefIterator iterator() {
final int numElements = currentElement;
return new BytesRefIterator() {
private final BytesRef spare = new BytesRef();
private int pos = 0;
@Override
public BytesRef next() throws IOException {
if (pos < numElements) {
get(spare, pos++);
return spare;
}
return null;
}
@Override
public Comparator<BytesRef> getComparator() {
return null;
}
};
} }
public int[] sort(final Comparator<BytesRef> comp) { /**
* Appends a copy of the given {@link BytesRef} to this {@link BytesRefList}.
* @param bytes the bytes to append
* @return the ordinal of the appended bytes
*/
public int append(BytesRef bytes) {
if (lastElement >= offsets.length) {
int oldLen = offsets.length;
offsets = ArrayUtil.grow(offsets, offsets.length + 1);
bytesUsed.addAndGet((offsets.length - oldLen)
* RamUsageEstimator.NUM_BYTES_INT);
}
pool.copy(bytes);
offsets[lastElement++] = currentOffset;
currentOffset += bytes.length;
return lastElement;
}
/**
* Returns the current size of this {@link BytesRefList}
* @return the current size of this {@link BytesRefList}
*/
public int size() {
return lastElement;
}
/**
* Returns the <i>n'th</i> element of this {@link BytesRefList}
* @param spare a spare {@link BytesRef} instance
* @param ord the elements ordinal to retrieve
* @return the <i>n'th</i> element of this {@link BytesRefList}
*/
public BytesRef get(BytesRef spare, int ord) {
if (lastElement > ord) {
spare.offset = offsets[ord];
spare.length = ord == lastElement - 1 ? currentOffset - spare.offset
: offsets[ord + 1] - spare.offset;
pool.copyFrom(spare);
return spare;
}
throw new IndexOutOfBoundsException("index " + ord
+ " must be less than the size: " + lastElement);
}
/**
* Returns the number internally used bytes to hold the appended bytes in
* memory
*
* @return the number internally used bytes to hold the appended bytes in
* memory
*/
public long bytesUsed() {
return bytesUsed.get();
}
private int[] sort(final Comparator<BytesRef> comp) {
final int[] orderdEntries = new int[size()]; final int[] orderdEntries = new int[size()];
for (int i = 0; i < orderdEntries.length; i++) { for (int i = 0; i < orderdEntries.length; i++) {
orderdEntries[i] = i; orderdEntries[i] = i;
@@ -110,22 +142,65 @@ final class BytesRefList {
final int ord1 = orderdEntries[i], ord2 = orderdEntries[j]; final int ord1 = orderdEntries[i], ord2 = orderdEntries[j];
return comp.compare(get(scratch1, ord1), get(scratch2, ord2)); return comp.compare(get(scratch1, ord1), get(scratch2, ord2));
} }
@Override @Override
protected void setPivot(int i) { protected void setPivot(int i) {
final int ord = orderdEntries[i]; final int ord = orderdEntries[i];
get(pivot, ord); get(pivot, ord);
} }
@Override @Override
protected int comparePivot(int j) { protected int comparePivot(int j) {
final int ord = orderdEntries[j]; final int ord = orderdEntries[j];
return comp.compare(pivot, get(scratch2, ord)); return comp.compare(pivot, get(scratch2, ord));
} }
private final BytesRef pivot = new BytesRef(), private final BytesRef pivot = new BytesRef(), scratch1 = new BytesRef(),
scratch1 = new BytesRef(), scratch2 = new BytesRef(); scratch2 = new BytesRef();
}.quickSort(0, size() - 1); }.quickSort(0, size() - 1);
return orderdEntries; return orderdEntries;
} }
/**
* sugar for {@link #iterator(Comparator)} with a <code>null</code> comparator
*/
public BytesRefIterator iterator() {
return iterator(null);
}
/**
* <p>
* Returns a {@link BytesRefIterator} with point in time semantics. The
* iterator provides access to all so far appended {@link BytesRef} instances.
* </p>
* <p>
* If a non <code>null</code> {@link Comparator} is provided the iterator will
* iterate the byte values in the order specified by the comparator. Otherwise
* the order is the same as the values were appended.
* </p>
* <p>
* This is a non-destructive operation.
* </p>
*/
public BytesRefIterator iterator(final Comparator<BytesRef> comp) {
final BytesRef spare = new BytesRef();
final int size = size();
final int[] ords = comp == null ? null : sort(comp);
return new BytesRefIterator() {
int pos = 0;
@Override
public BytesRef next() throws IOException {
if (pos < size) {
return get(spare, ords == null ? pos++ : ords[pos++]);
}
return null;
}
@Override
public Comparator<BytesRef> getComparator() {
return comp;
}
};
}
} }
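For illustration only (not part of this patch): a usage sketch of the BytesRefList introduced above, limited to the methods visible in this diff (append, size, get, iterator(Comparator), bytesUsed).

    import java.io.IOException;

    import org.apache.lucene.search.suggest.BytesRefList;
    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.BytesRefIterator;

    public class BytesRefListSketch {
      public static void main(String[] args) throws IOException {
        BytesRefList list = new BytesRefList();
        list.append(new BytesRef("delta"));
        list.append(new BytesRef("alpha"));
        list.append(new BytesRef("charlie"));

        // Random access by ordinal; the spare BytesRef is filled from the pool.
        BytesRef spare = new BytesRef();
        System.out.println(list.get(spare, 1).utf8ToString() + " / size=" + list.size());

        // Point-in-time iteration in comparator order (append order when null).
        BytesRefIterator it = list.iterator(BytesRef.getUTF8SortedAsUnicodeComparator());
        for (BytesRef ref = it.next(); ref != null; ref = it.next()) {
          System.out.println(ref.utf8ToString());
        }
        System.out.println(list.bytesUsed() + " bytes used");
      }
    }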


@@ -75,7 +75,11 @@ public class FileDictionary implements Dictionary {
       String[] fields = line.split("\t");
       if (fields.length > 1) {
         // keep reading floats for bw compat
-        curFreq = (int)Float.parseFloat(fields[1]);
+        try {
+          curFreq = Long.parseLong(fields[1]);
+        } catch (NumberFormatException e) {
+          curFreq = (long)Double.parseDouble(fields[1]);
+        }
         spare.copyChars(fields[0]);
       } else {
         spare.copyChars(line);
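For illustration only (not part of this patch): the line format the parser above accepts, one term per line with an optional tab-separated weight. How the stream is handed to FileDictionary is outside this hunk, so the sketch only builds the input.

    import java.io.ByteArrayInputStream;
    import java.io.InputStream;

    public class FileDictionaryInputSketch {
      /** Builds an in-memory stream in the format the parser above accepts. */
      public static InputStream sampleInput() throws Exception {
        String lines = "lucene\t1000\n"   // term<TAB>weight, parsed with Long.parseLong
                     + "solr\t500.0\n"    // legacy float weight, falls back to Double.parseDouble
                     + "search\n";        // no weight column at all
        return new ByteArrayInputStream(lines.getBytes("UTF-8"));
      }
    }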


@@ -29,15 +29,19 @@ import org.apache.lucene.search.spell.TermFreqIterator;
 import org.apache.lucene.util.BytesRefIterator;
 import org.apache.lucene.util.PriorityQueue;
 
+/**
+ * Simple Lookup interface for {@link CharSequence} suggestions.
+ * @lucene.experimental
+ */
 public abstract class Lookup {
 
   /**
    * Result of a lookup.
    */
   public static final class LookupResult implements Comparable<LookupResult> {
     public final CharSequence key;
-    public final float value;
+    public final long value;
 
-    public LookupResult(CharSequence key, float value) {
+    public LookupResult(CharSequence key, long value) {
       this.key = key;
       this.value = value;
     }
@@ -112,6 +116,10 @@ public abstract class Lookup {
     build(tfit);
   }
 
+  /**
+   * Builds up a new internal {@link Lookup} representation based on the given {@link TermFreqIterator}.
+   * The implementation might re-sort the data internally.
+   */
   public abstract void build(TermFreqIterator tfit) throws IOException;
 
   /**
@@ -124,22 +132,7 @@ public abstract class Lookup {
    */
   public abstract List<LookupResult> lookup(CharSequence key, boolean onlyMorePopular, int num);
 
-  /**
-   * Modify the lookup data by recording additional data. Optional operation.
-   * @param key new lookup key
-   * @param value value to associate with this key
-   * @return true if new key is added, false if it already exists or operation
-   * is not supported.
-   */
-  public abstract boolean add(CharSequence key, Object value);
-
-  /**
-   * Get value associated with a specific key.
-   * @param key lookup key
-   * @return associated value
-   */
-  public abstract Object get(CharSequence key);
-
   /**
    * Persist the constructed lookup data to a directory. Optional operation.
    * @param output {@link OutputStream} to write the data to.
@@ -173,4 +166,5 @@ public abstract class Lookup {
    * @throws IOException when fatal IO error occurs.
    */
   public abstract boolean load(File storeDir) throws IOException;
+
 }
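For illustration only (not part of this patch): a minimal TermFreqIterator over an in-memory map, suitable as input for build(TermFreqIterator). It assumes the post-cleanup contract visible in this diff, namely next() returning BytesRef, weight() returning a long, and getComparator().

    import java.util.Comparator;
    import java.util.Iterator;
    import java.util.Map;

    import org.apache.lucene.search.spell.TermFreqIterator;
    import org.apache.lucene.util.BytesRef;

    public class MapTermFreqIterator implements TermFreqIterator {
      private final Iterator<Map.Entry<String, Long>> entries;
      private long currentWeight;

      public MapTermFreqIterator(Map<String, Long> weights) {
        this.entries = weights.entrySet().iterator();
      }

      @Override
      public BytesRef next() {
        if (!entries.hasNext()) {
          return null;
        }
        Map.Entry<String, Long> e = entries.next();
        currentWeight = e.getValue().longValue();
        return new BytesRef(e.getKey());
      }

      @Override
      public long weight() {
        return currentWeight;
      }

      @Override
      public Comparator<BytesRef> getComparator() {
        return null; // no guaranteed order; implementations may re-sort
      }
    }

Any of the Lookup implementations in this patch could consume such an iterator via build(new MapTermFreqIterator(weights)).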


@@ -17,45 +17,166 @@ package org.apache.lucene.search.suggest;
* limitations under the License. * limitations under the License.
*/ */
import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.util.Comparator; import java.util.Comparator;
import org.apache.lucene.search.spell.TermFreqIterator; import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.search.suggest.fst.Sort;
import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesReader;
import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesWriter;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
/** /**
* This wrapper buffers incoming elements and makes sure they are sorted in * This wrapper buffers incoming elements and makes sure they are sorted based on given comparator.
* ascending lexicographic order. * @lucene.experimental
*/ */
public class SortedTermFreqIteratorWrapper extends BufferingTermFreqIteratorWrapper { public class SortedTermFreqIteratorWrapper implements TermFreqIterator {
// TODO keep this for now - but the consumer should really sort this stuff on disk with sorter...
private final int[] sortedOrds; private final TermFreqIterator source;
private int currentOrd = -1; private File tempInput;
private final BytesRef spare = new BytesRef(); private File tempSorted;
private final Comparator<BytesRef> comp; private final ByteSequencesReader reader;
private boolean done = false;
public SortedTermFreqIteratorWrapper(TermFreqIterator source, Comparator<BytesRef> comp) throws IOException {
super(source); private long weight;
this.sortedOrds = entries.sort(comp); private final BytesRef scratch = new BytesRef();
this.comp = comp; private final Comparator<BytesRef> comparator;
public SortedTermFreqIteratorWrapper(TermFreqIterator source, Comparator<BytesRef> comparator) throws IOException {
this(source, comparator, false);
} }
@Override public SortedTermFreqIteratorWrapper(TermFreqIterator source, Comparator<BytesRef> comparator, boolean compareRawBytes) throws IOException {
public long weight() { this.source = source;
return freqs[currentOrd]; this.comparator = comparator;
this.reader = sort(compareRawBytes ? comparator : new BytesOnlyComparator(this.comparator));
} }
@Override @Override
public BytesRef next() throws IOException { public BytesRef next() throws IOException {
if (++curPos < entries.size()) { boolean success = false;
return entries.get(spare, (currentOrd = sortedOrds[curPos])); if (done) {
return null;
}
try {
ByteArrayDataInput input = new ByteArrayDataInput();
if (reader.read(scratch)) {
weight = decode(scratch, input);
success = true;
return scratch;
}
close();
success = done = true;
return null;
} finally {
if (!success) {
done = true;
close();
}
} }
return null;
} }
@Override @Override
public Comparator<BytesRef> getComparator() { public Comparator<BytesRef> getComparator() {
return comp; return comparator;
}
@Override
public long weight() {
return weight;
}
private Sort.ByteSequencesReader sort(Comparator<BytesRef> comparator) throws IOException {
String prefix = getClass().getSimpleName();
File directory = Sort.defaultTempDir();
tempInput = File.createTempFile(prefix, ".input", directory);
tempSorted = File.createTempFile(prefix, ".sorted", directory);
final Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput);
boolean success = false;
try {
BytesRef spare;
byte[] buffer = new byte[0];
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
while ((spare = source.next()) != null) {
encode(writer, output, buffer, spare, source.weight());
}
writer.close();
new Sort(comparator).sort(tempInput, tempSorted);
ByteSequencesReader reader = new Sort.ByteSequencesReader(tempSorted);
success = true;
return reader;
} finally {
if (success) {
IOUtils.close(writer);
} else {
try {
IOUtils.closeWhileHandlingException(writer);
} finally {
close();
}
}
}
}
private void close() throws IOException {
if (tempInput != null) {
tempInput.delete();
}
if (tempSorted != null) {
tempSorted.delete();
}
IOUtils.close(reader);
}
private final static class BytesOnlyComparator implements Comparator<BytesRef> {
final Comparator<BytesRef> other;
private final BytesRef leftScratch = new BytesRef();
private final BytesRef rightScratch = new BytesRef();
public BytesOnlyComparator(Comparator<BytesRef> other) {
this.other = other;
}
@Override
public int compare(BytesRef left, BytesRef right) {
wrap(leftScratch, left);
wrap(rightScratch, right);
return other.compare(leftScratch, rightScratch);
}
private void wrap(BytesRef wrapper, BytesRef source) {
wrapper.bytes = source.bytes;
wrapper.offset = source.offset;
wrapper.length = source.length - 8;
}
}
protected void encode(ByteSequencesWriter writer, ByteArrayDataOutput output, byte[] buffer, BytesRef spare, long weight) throws IOException {
if (spare.length + 8 >= buffer.length) {
buffer = ArrayUtil.grow(buffer, spare.length + 8);
}
output.reset(buffer);
output.writeBytes(spare.bytes, spare.offset, spare.length);
output.writeLong(weight);
writer.write(buffer, 0, output.getPosition());
}
protected long decode(BytesRef scratch, ByteArrayDataInput tmpInput) {
tmpInput.reset(scratch.bytes);
tmpInput.skipBytes(scratch.length - 8); // suggestion + separator
scratch.length -= 8; // sep + long
return tmpInput.readLong();
} }
} }
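For illustration only (not part of this patch): draining a TermFreqIterator through the wrapper above, using only the constructor and methods shown in this diff; the wrapper spills to temporary files via Sort and hands entries back in comparator order.

    import java.io.IOException;

    import org.apache.lucene.search.spell.TermFreqIterator;
    import org.apache.lucene.search.suggest.SortedTermFreqIteratorWrapper;
    import org.apache.lucene.util.BytesRef;

    public class SortedWrapperSketch {
      public static void dumpSorted(TermFreqIterator source) throws IOException {
        TermFreqIterator sorted = new SortedTermFreqIteratorWrapper(
            source, BytesRef.getUTF8SortedAsUnicodeComparator());
        for (BytesRef term = sorted.next(); term != null; term = sorted.next()) {
          System.out.println(term.utf8ToString() + " => " + sorted.weight());
        }
      }
    }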


@@ -26,6 +26,7 @@ import org.apache.lucene.util.BytesRef;
 
 /**
  * This wrapper buffers the incoming elements and makes sure they are in
  * random order.
+ * @lucene.experimental
  */
 public class UnsortedTermFreqIteratorWrapper extends BufferingTermFreqIteratorWrapper {
   // TODO keep this for now


@@ -18,13 +18,16 @@ package org.apache.lucene.search.suggest.fst;
  */
 
 import java.io.IOException;
-import java.util.Iterator;
+import java.util.Comparator;
 
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefIterator;
 
 /**
  * Collects {@link BytesRef} and then allows one to iterate over their sorted order. Implementations
  * of this interface will be called in a single-threaded scenario.
+ * @lucene.experimental
+ * @lucene.internal
  */
 public interface BytesRefSorter {
   /**
@@ -42,5 +45,7 @@ public interface BytesRefSorter {
    * 
    * @throws IOException If an I/O exception occurs.
    */
-  Iterator<BytesRef> iterator() throws IOException;
+  BytesRefIterator iterator() throws IOException;
+
+  Comparator<BytesRef> getComparator();
 }
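For illustration only (not part of this patch): consuming the revised contract, which hands back a BytesRefIterator (null-terminated next()) instead of a java.util.Iterator.

    import java.io.IOException;

    import org.apache.lucene.search.suggest.fst.BytesRefSorter;
    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.BytesRefIterator;

    public class BytesRefSorterSketch {
      /** Adds a few entries and drains them back in the sorter's comparator order. */
      public static void roundTrip(BytesRefSorter sorter) throws IOException {
        sorter.add(new BytesRef("banana"));
        sorter.add(new BytesRef("apple"));

        BytesRefIterator it = sorter.iterator();
        for (BytesRef ref = it.next(); ref != null; ref = it.next()) {
          System.out.println(ref.utf8ToString());
        }
      }
    }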


@@ -18,59 +18,63 @@ package org.apache.lucene.search.suggest.fst;
  */
 
 import java.io.*;
-import java.util.Iterator;
-import java.util.NoSuchElementException;
+import java.util.Comparator;
 
 import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesReader;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefIterator;
+import org.apache.lucene.util.IOUtils;
 
 /**
  * Builds and iterates over sequences stored on disk.
+ * @lucene.experimental
+ * @lucene.internal
  */
 public class ExternalRefSorter implements BytesRefSorter, Closeable {
   private final Sort sort;
   private Sort.ByteSequencesWriter writer;
   private File input;
   private File sorted;
 
   /**
    * Will buffer all sequences to a temporary file and then sort (all on-disk).
    */
   public ExternalRefSorter(Sort sort) throws IOException {
     this.sort = sort;
-    this.input = File.createTempFile("RefSorter-", ".raw", Sort.defaultTempDir());
+    this.input = File.createTempFile("RefSorter-", ".raw",
+        Sort.defaultTempDir());
     this.writer = new Sort.ByteSequencesWriter(input);
   }
 
   @Override
   public void add(BytesRef utf8) throws IOException {
-    if (writer == null)
-      throw new IllegalStateException();
+    if (writer == null) throw new IllegalStateException();
     writer.write(utf8);
   }
 
-  @Override
-  public Iterator<BytesRef> iterator() throws IOException {
+  public BytesRefIterator iterator() throws IOException {
     if (sorted == null) {
       closeWriter();
 
-      sorted = File.createTempFile("RefSorter-", ".sorted", Sort.defaultTempDir());
+      sorted = File.createTempFile("RefSorter-", ".sorted",
+          Sort.defaultTempDir());
       sort.sort(input, sorted);
 
       input.delete();
       input = null;
     }
 
-    return new ByteSequenceIterator(new Sort.ByteSequencesReader(sorted));
+    return new ByteSequenceIterator(new Sort.ByteSequencesReader(sorted),
+        sort.getComparator());
   }
 
   private void closeWriter() throws IOException {
     if (writer != null) {
       writer.close();
       writer = null;
     }
   }
 
   /**
    * Removes any written temporary files.
    */
@@ -83,40 +87,54 @@ public class ExternalRefSorter implements BytesRefSorter, Closeable {
       if (sorted != null) sorted.delete();
     }
   }
 
   /**
    * Iterate over byte refs in a file.
    */
-  class ByteSequenceIterator implements Iterator<BytesRef> {
-    private ByteSequencesReader reader;
-    private byte[] next;
+  class ByteSequenceIterator implements BytesRefIterator {
+    private final ByteSequencesReader reader;
+    private BytesRef scratch = new BytesRef();
+    private final Comparator<BytesRef> comparator;
 
-    public ByteSequenceIterator(ByteSequencesReader reader) throws IOException {
+    public ByteSequenceIterator(ByteSequencesReader reader,
+        Comparator<BytesRef> comparator) {
       this.reader = reader;
-      this.next = reader.read();
-    }
-
-    @Override
-    public boolean hasNext() {
-      return next != null;
+      this.comparator = comparator;
     }
 
     @Override
-    public BytesRef next() {
-      if (next == null) throw new NoSuchElementException();
-      BytesRef r = new BytesRef(next);
-      try {
-        next = reader.read();
-        if (next == null) {
-          reader.close();
-        }
-      } catch (IOException e) {
-        throw new RuntimeException(e);
+    public BytesRef next() throws IOException {
+      if (scratch == null) {
+        return null;
+      }
+      boolean success = false;
+      try {
+        byte[] next = reader.read();
+        if (next != null) {
+          scratch.bytes = next;
+          scratch.length = next.length;
+          scratch.offset = 0;
+        } else {
+          IOUtils.close(reader);
+          scratch = null;
+        }
+        success = true;
+        return scratch;
+      } finally {
+        if (!success) {
+          IOUtils.closeWhileHandlingException(reader);
+        }
       }
-      return r;
     }
 
     @Override
-    public void remove() { throw new UnsupportedOperationException(); }
+    public Comparator<BytesRef> getComparator() {
+      return comparator;
+    }
+  }
+
+  @Override
+  public Comparator<BytesRef> getComparator() {
+    return sort.getComparator();
   }
 }
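For illustration only (not part of this patch): wiring ExternalRefSorter to a comparator-aware Sort, as the constructor signatures in this diff suggest; close() removes the temporary files.

    import org.apache.lucene.search.suggest.fst.ExternalRefSorter;
    import org.apache.lucene.search.suggest.fst.Sort;
    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.BytesRefIterator;

    public class ExternalRefSorterSketch {
      public static void main(String[] args) throws Exception {
        ExternalRefSorter sorter = new ExternalRefSorter(
            new Sort(BytesRef.getUTF8SortedAsUnicodeComparator()));
        try {
          sorter.add(new BytesRef("zebra"));
          sorter.add(new BytesRef("aardvark"));

          // Buffers to a temp file, sorts on disk, then streams the result back.
          BytesRefIterator it = sorter.iterator();
          for (BytesRef ref = it.next(); ref != null; ref = it.next()) {
            System.out.println(ref.utf8ToString());
          }
        } finally {
          sorter.close();
        }
      }
    }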


@@ -28,6 +28,7 @@ import org.apache.lucene.util.fst.FST.Arc;
  * Finite state automata based implementation of "autocomplete" functionality.
  * 
  * @see FSTCompletionBuilder
+ * @lucene.experimental
  */
 
 // TODO: we could store exact weights as outputs from the FST (int4 encoded
@@ -159,10 +160,10 @@ public class FSTCompletion {
    * @param utf8
    *          The sequence of utf8 bytes to follow.
    * 
-   * @return Returns the bucket number of the match or <code>null</code> if no
+   * @return Returns the bucket number of the match or <code>-1</code> if no
    *         match was found.
    */
-  private Integer getExactMatchStartingFromRootArc(
+  private int getExactMatchStartingFromRootArc(
       int rootArcIndex, BytesRef utf8) {
     // Get the UTF-8 bytes representation of the input key.
     try {
@@ -186,7 +187,7 @@ public class FSTCompletion {
     }
 
     // No match.
-    return null;
+    return -1;
   }
 
   /**
@@ -273,8 +274,8 @@ public class FSTCompletion {
     // exact match, if requested.
     if (exactFirst) {
       if (!checkExistingAndReorder(res, key)) {
-        Integer exactMatchBucket = getExactMatchStartingFromRootArc(i, key);
-        if (exactMatchBucket != null) {
+        int exactMatchBucket = getExactMatchStartingFromRootArc(i, key);
+        if (exactMatchBucket != -1) {
           // Insert as the first result and truncate at num.
           while (res.size() >= num) {
             res.remove(res.size() - 1);
@@ -385,10 +386,10 @@ public class FSTCompletion {
   }
 
   /**
-   * Returns the bucket assigned to a given key (if found) or <code>null</code> if
+   * Returns the bucket assigned to a given key (if found) or <code>-1</code> if
    * no exact match exists.
    */
-  public Integer getBucket(CharSequence key) {
+  public int getBucket(CharSequence key) {
     return getExactMatchStartingFromRootArc(0, new BytesRef(key));
   }
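For illustration only (not part of this patch): the primitive -1 sentinel that replaces the old Integer/null return of getBucket.

    import org.apache.lucene.search.suggest.fst.FSTCompletion;

    public class BucketCheckSketch {
      public static boolean hasExactMatch(FSTCompletion completion, CharSequence key) {
        return completion.getBucket(key) != -1; // previously: getBucket(key) != null
      }
    }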


@@ -19,9 +19,9 @@ package org.apache.lucene.search.suggest.fst;
 
 import java.io.Closeable;
 import java.io.IOException;
-import java.util.Iterator;
 
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefIterator;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.fst.*;
 
@@ -98,6 +98,7 @@ import org.apache.lucene.util.fst.*;
  * change, requiring you to rebuild the FST suggest index.
  * 
  * @see FSTCompletion
+ * @lucene.experimental
  */
 public class FSTCompletionBuilder {
   /**
@@ -143,10 +144,11 @@ public class FSTCompletionBuilder {
 
   /**
    * Creates an {@link FSTCompletion} with default options: 10 buckets, exact match
-   * promoted to first position and {@link InMemorySorter}.
+   * promoted to first position and {@link InMemorySorter} with a comparator obtained from
+   * {@link BytesRef#getUTF8SortedAsUnicodeComparator()}.
    */
   public FSTCompletionBuilder() {
-    this(DEFAULT_BUCKETS, new InMemorySorter(), Integer.MAX_VALUE);
+    this(DEFAULT_BUCKETS, new InMemorySorter(BytesRef.getUTF8SortedAsUnicodeComparator()), Integer.MAX_VALUE);
   }
 
   /**
@@ -237,10 +239,12 @@ public class FSTCompletionBuilder {
         shareMaxTailLength, outputs, null, false);
 
     BytesRef scratch = new BytesRef();
+    BytesRef entry;
     final IntsRef scratchIntsRef = new IntsRef();
     int count = 0;
-    for (Iterator<BytesRef> i = sorter.iterator(); i.hasNext(); count++) {
-      BytesRef entry = i.next();
+    BytesRefIterator iter = sorter.iterator();
+    while((entry = iter.next()) != null) {
+      count++;
       if (scratch.compareTo(entry) != 0) {
         builder.add(Util.toIntsRef(entry, scratchIntsRef), empty);
         scratch.copyBytes(entry);
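For illustration only (not part of this patch): both sorters now take the comparator explicitly; the three-argument constructor and the default bucket count are taken from the code shown above.

    import org.apache.lucene.search.suggest.fst.ExternalRefSorter;
    import org.apache.lucene.search.suggest.fst.FSTCompletionBuilder;
    import org.apache.lucene.search.suggest.fst.InMemorySorter;
    import org.apache.lucene.search.suggest.fst.Sort;
    import org.apache.lucene.util.BytesRef;

    public class BuilderSetupSketch {
      /** Equivalent to the no-arg constructor: 10 buckets, in-memory sorting. */
      public static FSTCompletionBuilder inMemory() {
        return new FSTCompletionBuilder(10,
            new InMemorySorter(BytesRef.getUTF8SortedAsUnicodeComparator()),
            Integer.MAX_VALUE);
      }

      /** Spills pending entries to disk instead of holding them in memory. */
      public static FSTCompletionBuilder onDisk() throws Exception {
        return new FSTCompletionBuilder(10,
            new ExternalRefSorter(new Sort(BytesRef.getUTF8SortedAsUnicodeComparator())),
            Integer.MAX_VALUE);
      }
    }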


@@ -59,6 +59,7 @@ import org.apache.lucene.util.fst.NoOutputs;
  * use {@link FSTCompletion} directly or {@link TSTLookup}, for example.
  * 
  * @see FSTCompletion
+ * @lucene.experimental
  */
 public class FSTCompletionLookup extends Lookup {
   /**
@@ -171,7 +172,7 @@ public class FSTCompletionLookup extends Lookup {
         }
 
         output.reset(buffer);
-        output.writeInt(FloatMagic.toSortable(tfit.weight()));
+        output.writeInt(encodeWeight(tfit.weight()));
         output.writeBytes(spare.bytes, spare.offset, spare.length);
         writer.write(buffer, 0, output.getPosition());
       }
@@ -188,13 +189,13 @@ public class FSTCompletionLookup extends Lookup {
       reader = new Sort.ByteSequencesReader(tempSorted);
       long line = 0;
       int previousBucket = 0;
-      float previousScore = 0;
+      int previousScore = 0;
       ByteArrayDataInput input = new ByteArrayDataInput();
       BytesRef tmp1 = new BytesRef();
       BytesRef tmp2 = new BytesRef();
       while (reader.read(tmp1)) {
         input.reset(tmp1.bytes);
-        float currentScore = FloatMagic.fromSortable(input.readInt());
+        int currentScore = input.readInt();
 
         int bucket;
         if (line > 0 && currentScore == previousScore) {
@@ -230,6 +231,14 @@ public class FSTCompletionLookup extends Lookup {
       tempSorted.delete();
     }
   }
+
+  /** weight -> cost */
+  private static int encodeWeight(long value) {
+    if (value < Integer.MIN_VALUE || value > Integer.MAX_VALUE) {
+      throw new UnsupportedOperationException("cannot encode value: " + value);
+    }
+    return (int)value;
+  }
 
   @Override
   public List<LookupResult> lookup(CharSequence key, boolean higherWeightsFirst, int num) {
@@ -250,19 +259,9 @@ public class FSTCompletionLookup extends Lookup {
     return results;
   }
 
-  @Override
-  public boolean add(CharSequence key, Object value) {
-    // Not supported.
-    return false;
-  }
-
-  @Override
   public Object get(CharSequence key) {
-    Integer bucket = normalCompletion.getBucket(key);
-    if (bucket == null)
-      return null;
-    else
-      return (float) normalCompletion.getBucket(key) / normalCompletion.getBucketCount();
+    final int bucket = normalCompletion.getBucket(key);
+    return bucket == -1 ? null : Long.valueOf(bucket);
   }
 
   /**


@@ -1,75 +0,0 @@
package org.apache.lucene.search.suggest.fst;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.NumericUtils;
/**
* Converts normalized float representations ({@link Float#floatToIntBits(float)})
* into integers that are directly sortable in int4 representation (or unsigned values or
* after promoting to a long with higher 32-bits zeroed).
*/
class FloatMagic {
/**
* Convert a float to a directly sortable unsigned integer. For sortable signed
* integers, see {@link NumericUtils#floatToSortableInt(float)}.
*/
public static int toSortable(float f) {
return floatBitsToUnsignedOrdered(Float.floatToRawIntBits(f));
}
/**
* Back from {@link #toSortable(float)} to float.
*/
public static float fromSortable(int v) {
return Float.intBitsToFloat(unsignedOrderedToFloatBits(v));
}
/**
* Convert float bits to directly sortable bits.
* Normalizes all NaNs to canonical form.
*/
static int floatBitsToUnsignedOrdered(int v) {
// Canonicalize NaN ranges. I assume this check will be faster here than
// (v == v) == false on the FPU? We don't distinguish between different
// flavors of NaNs here (see http://en.wikipedia.org/wiki/NaN). I guess
// in Java this doesn't matter much anyway.
if ((v & 0x7fffffff) > 0x7f800000) {
// Apply the logic below to a canonical "quiet NaN"
return 0x7fc00000 ^ 0x80000000;
}
if (v < 0) {
// Reverse the order of negative values and push them before positive values.
return ~v;
} else {
// Shift positive values after negative, but before NaNs, they're sorted already.
return v ^ 0x80000000;
}
}
/**
* Back from {@link #floatBitsToUnsignedOrdered(int)}.
*/
static int unsignedOrderedToFloatBits(int v) {
if (v < 0)
return v & ~0x80000000;
else
return ~v;
}
}


@@ -17,29 +17,40 @@ package org.apache.lucene.search.suggest.fst;
  * limitations under the License.
  */
 
-import java.util.*;
+import java.util.Comparator;
 
+import org.apache.lucene.search.suggest.BytesRefList;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefIterator;
 
 /**
  * An {@link BytesRefSorter} that keeps all the entries in memory.
+ * @lucene.experimental
+ * @lucene.internal
  */
 public final class InMemorySorter implements BytesRefSorter {
-  // TODO: use a single byte[] to back up all entries?
-  private final ArrayList<BytesRef> refs = new ArrayList<BytesRef>();
-
+  private final BytesRefList buffer = new BytesRefList();
   private boolean closed = false;
+  private final Comparator<BytesRef> comparator;
 
+  public InMemorySorter(Comparator<BytesRef> comparator) {
+    this.comparator = comparator;
+  }
+
   @Override
   public void add(BytesRef utf8) {
     if (closed) throw new IllegalStateException();
-    refs.add(BytesRef.deepCopyOf(utf8));
+    buffer.append(utf8);
   }
 
   @Override
-  public Iterator<BytesRef> iterator() {
+  public BytesRefIterator iterator() {
     closed = true;
-    Collections.sort(refs, BytesRef.getUTF8SortedAsUnicodeComparator());
-    return Collections.unmodifiableCollection(refs).iterator();
+    return buffer.iterator(comparator);
+  }
+
+  @Override
+  public Comparator<BytesRef> getComparator() {
+    return comparator;
   }
 }


@@ -20,15 +20,10 @@ package org.apache.lucene.search.suggest.fst;
 import java.io.*;
 import java.util.*;
 
+import org.apache.lucene.search.suggest.BytesRefList;
 import org.apache.lucene.util.*;
 import org.apache.lucene.util.PriorityQueue;
 
-// TODO: the buffer is currently byte[][] which with very small arrays will terribly overallocate
-// memory (alignments) and make GC very happy.
-//
-// We could move it to a single byte[] + and use custom sorting, but we'd need to check if this
-// yields any improvement first.
-
 /**
  * On-disk sorting of byte arrays. Each byte array (entry) is a composed of the following
  * fields:
@@ -38,6 +33,8 @@ import org.apache.lucene.util.PriorityQueue;
  * </ul>
  * 
  * @see #sort(File, File)
+ * @lucene.experimental
+ * @lucene.internal
  */
 public final class Sort {
   public final static int MB = 1024 * 1024;
@@ -59,11 +56,6 @@ public final class Sort {
    */
   public final static int MAX_TEMPFILES = 128;
 
-  /**
-   * Minimum slot buffer expansion.
-   */
-  private final static int MIN_EXPECTED_GROWTH = 1000;
-
   /**
    * A bit more descriptive unit for constructors.
    *
@@ -111,21 +103,6 @@ public final class Sort {
     }
   }
 
-  /**
-   * byte[] in unsigned byte order.
-   */
-  static final Comparator<byte[]> unsignedByteOrderComparator = new Comparator<byte[]>() {
-    public int compare(byte[] left, byte[] right) {
-      final int max = Math.min(left.length, right.length);
-      for (int i = 0, j = 0; i < max; i++, j++) {
-        int diff = (left[i] & 0xff) - (right[j] & 0xff);
-        if (diff != 0)
-          return diff;
-      }
-      return left.length - right.length;
-    }
-  };
-
   /**
    * Sort info (debugging mostly).
    */
@@ -149,14 +126,15 @@ public final class Sort {
     }
   }
 
-  private final static byte [][] EMPTY = new byte [0][];
-
   private final BufferSize ramBufferSize;
   private final File tempDirectory;
-  private byte [][] buffer = new byte [0][];
+  private final BytesRefList buffer = new BytesRefList();
   private SortInfo sortInfo;
   private int maxTempFiles;
+  private final Comparator<BytesRef> comparator;
+
+  public static final Comparator<BytesRef> DEFAULT_COMPARATOR = BytesRef.getUTF8SortedAsUnicodeComparator();
 
   /**
    * Defaults constructor.
@@ -165,13 +143,17 @@ public final class Sort {
    * @see BufferSize#automatic()
    */
   public Sort() throws IOException {
-    this(BufferSize.automatic(), defaultTempDir(), MAX_TEMPFILES);
+    this(DEFAULT_COMPARATOR, BufferSize.automatic(), defaultTempDir(), MAX_TEMPFILES);
+  }
+
+  public Sort(Comparator<BytesRef> comparator) throws IOException {
+    this(comparator, BufferSize.automatic(), defaultTempDir(), MAX_TEMPFILES);
   }
 
   /**
    * All-details constructor.
    */
-  public Sort(BufferSize ramBufferSize, File tempDirectory, int maxTempfiles) {
+  public Sort(Comparator<BytesRef> comparator, BufferSize ramBufferSize, File tempDirectory, int maxTempfiles) {
     if (ramBufferSize.bytes < ABSOLUTE_MIN_SORT_BUFFER_SIZE) {
       throw new IllegalArgumentException(MIN_BUFFER_SIZE_MSG + ": " + ramBufferSize.bytes);
     }
@@ -183,6 +165,7 @@ public final class Sort {
     this.ramBufferSize = ramBufferSize;
     this.tempDirectory = tempDirectory;
     this.maxTempFiles = maxTempfiles;
+    this.comparator = comparator;
   }
 
   /**
@@ -283,23 +266,25 @@ public final class Sort {
   /** Sort a single partition in-memory. */
   protected File sortPartition(int len) throws IOException {
-    byte [][] data = this.buffer;
+    BytesRefList data = this.buffer;
     File tempFile = File.createTempFile("sort", "partition", tempDirectory);
 
     long start = System.currentTimeMillis();
-    Arrays.sort(data, 0, len, unsignedByteOrderComparator);
     sortInfo.sortTime += (System.currentTimeMillis() - start);
 
-    ByteSequencesWriter out = new ByteSequencesWriter(tempFile);
+    final ByteSequencesWriter out = new ByteSequencesWriter(tempFile);
+    BytesRef spare;
     try {
-      for (int i = 0; i < len; i++) {
-        assert data[i].length <= Short.MAX_VALUE;
-        out.write(data[i]);
+      BytesRefIterator iter = buffer.iterator(comparator);
+      while((spare = iter.next()) != null) {
+        assert spare.length <= Short.MAX_VALUE;
+        out.write(spare);
       }
 
       out.close();
 
       // Clean up the buffer for the next partition.
-      this.buffer = EMPTY;
+      data.clear();
      return tempFile;
    } finally {
      IOUtils.close(out);
@@ -314,7 +299,7 @@ public final class Sort {
     PriorityQueue<FileAndTop> queue = new PriorityQueue<FileAndTop>(merges.size()) {
       protected boolean lessThan(FileAndTop a, FileAndTop b) {
-        return a.current.compareTo(b.current) < 0;
+        return comparator.compare(a.current, b.current) < 0;
       }
     };
@@ -359,33 +344,18 @@ public final class Sort {
   /** Read in a single partition of data */
   int readPartition(ByteSequencesReader reader) throws IOException {
     long start = System.currentTimeMillis();
-
-    // We will be reallocating from scratch.
-    Arrays.fill(this.buffer, null);
-
-    int bytesLimit = this.ramBufferSize.bytes;
-    byte [][] data = this.buffer;
-    byte[] line;
-    int linesRead = 0;
-    while ((line = reader.read()) != null) {
-      if (linesRead + 1 >= data.length) {
-        data = Arrays.copyOf(data,
-            ArrayUtil.oversize(linesRead + MIN_EXPECTED_GROWTH,
-                RamUsageEstimator.NUM_BYTES_OBJECT_REF));
-      }
-      data[linesRead++] = line;
-
+    final BytesRef scratch = new BytesRef();
+    while ((scratch.bytes = reader.read()) != null) {
+      scratch.length = scratch.bytes.length;
+      buffer.append(scratch);
       // Account for the created objects.
       // (buffer slots do not account to buffer size.)
-      bytesLimit -= line.length + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER;
-      if (bytesLimit < 0) {
+      if (ramBufferSize.bytes < buffer.bytesUsed()) {
         break;
       }
     }
-    this.buffer = data;
-
     sortInfo.readTime += (System.currentTimeMillis() - start);
-    return linesRead;
+    return buffer.size();
   }
 
   static class FileAndTop {
@@ -515,5 +485,9 @@ public final class Sort {
         ((Closeable) is).close();
       }
     }
+  }
+
+  public Comparator<BytesRef> getComparator() {
+    return comparator;
   }
 }
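For illustration only (not part of this patch): a small end-to-end run of the comparator-aware Sort, limited to the ByteSequencesWriter/ByteSequencesReader calls that appear in this diff.

    import java.io.File;

    import org.apache.lucene.search.suggest.fst.Sort;
    import org.apache.lucene.util.BytesRef;

    public class OnDiskSortSketch {
      public static void main(String[] args) throws Exception {
        File dir = Sort.defaultTempDir();
        File unsorted = File.createTempFile("demo", ".input", dir);
        File sorted = File.createTempFile("demo", ".sorted", dir);

        Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(unsorted);
        try {
          writer.write(new BytesRef("delta"));
          writer.write(new BytesRef("alpha"));
          writer.write(new BytesRef("charlie"));
        } finally {
          writer.close();
        }

        // The comparator now travels with the Sort instance; merges use it too.
        new Sort(BytesRef.getUTF8SortedAsUnicodeComparator()).sort(unsorted, sorted);

        Sort.ByteSequencesReader reader = new Sort.ByteSequencesReader(sorted);
        BytesRef scratch = new BytesRef();
        try {
          while (reader.read(scratch)) {
            System.out.println(scratch.utf8ToString());
          }
        } finally {
          reader.close();
        }
        unsorted.delete();
        sorted.delete();
      }
    }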


@@ -28,6 +28,8 @@ import java.util.List;
 import org.apache.lucene.search.spell.TermFreqIterator;
 import org.apache.lucene.search.suggest.Lookup;
+import org.apache.lucene.search.suggest.SortedTermFreqIteratorWrapper;
+import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesWriter;
 import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.store.ByteArrayDataOutput;
 import org.apache.lucene.store.InputStreamDataInput;
@@ -102,72 +104,27 @@ public class WFSTCompletionLookup extends Lookup {
@Override @Override
public void build(TermFreqIterator iterator) throws IOException { public void build(TermFreqIterator iterator) throws IOException {
String prefix = getClass().getSimpleName();
File directory = Sort.defaultTempDir();
File tempInput = File.createTempFile(prefix, ".input", directory);
File tempSorted = File.createTempFile(prefix, ".sorted", directory);
Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput);
Sort.ByteSequencesReader reader = null;
BytesRef scratch = new BytesRef(); BytesRef scratch = new BytesRef();
TermFreqIterator iter = new WFSTTermFreqIteratorWrapper(iterator,
boolean success = false; BytesRef.getUTF8SortedAsUnicodeComparator());
try { IntsRef scratchInts = new IntsRef();
byte [] buffer = new byte [0]; BytesRef previous = null;
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer); PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
BytesRef spare; Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
while ((spare = iterator.next()) != null) { while ((scratch = iter.next()) != null) {
if (spare.length + 5 >= buffer.length) { long cost = iter.weight();
buffer = ArrayUtil.grow(buffer, spare.length + 5);
}
output.reset(buffer);
output.writeBytes(spare.bytes, spare.offset, spare.length);
output.writeByte((byte)0); // separator: not used, just for sort order
output.writeInt((int)encodeWeight(iterator.weight()));
writer.write(buffer, 0, output.getPosition());
}
writer.close();
new Sort().sort(tempInput, tempSorted);
reader = new Sort.ByteSequencesReader(tempSorted);
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); if (previous == null) {
Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs); previous = new BytesRef();
} else if (scratch.equals(previous)) {
BytesRef previous = null; continue; // for duplicate suggestions, the best weight is actually
BytesRef suggestion = new BytesRef(); // added
IntsRef scratchInts = new IntsRef();
ByteArrayDataInput input = new ByteArrayDataInput();
while (reader.read(scratch)) {
suggestion.bytes = scratch.bytes;
suggestion.offset = scratch.offset;
suggestion.length = scratch.length - 5; // int + separator
input.reset(scratch.bytes);
input.skipBytes(suggestion.length + 1); // suggestion + separator
long cost = input.readInt();
if (previous == null) {
previous = new BytesRef();
} else if (suggestion.equals(previous)) {
continue; // for duplicate suggestions, the best weight is actually added
}
Util.toIntsRef(suggestion, scratchInts);
builder.add(scratchInts, cost);
previous.copyBytes(suggestion);
} }
fst = builder.finish(); Util.toIntsRef(scratch, scratchInts);
success = true; builder.add(scratchInts, cost);
} finally { previous.copyBytes(scratch);
if (success) {
IOUtils.close(reader, writer);
} else {
IOUtils.closeWhileHandlingException(reader, writer);
}
tempInput.delete();
tempSorted.delete();
} }
fst = builder.finish();
} }
@Override @Override
@@ -270,16 +227,10 @@ public class WFSTCompletionLookup extends Lookup {
     return output;
   }
 
-  @Override
-  public boolean add(CharSequence key, Object value) {
-    return false; // Not supported.
-  }
-
   /**
    * Returns the weight associated with an input string,
    * or null if it does not exist.
    */
-  @Override
   public Object get(CharSequence key) {
     Arc<Long> arc = new Arc<Long>();
     Long result = null;
@@ -289,23 +240,51 @@ public class WFSTCompletionLookup extends Lookup {
     if (result == null || !arc.isFinal()) {
       return null;
     } else {
-      return decodeWeight(result + arc.nextFinalOutput);
+      return Integer.valueOf(decodeWeight(result + arc.nextFinalOutput));
     }
   }
 
   /** cost -> weight */
-  private static float decodeWeight(long encoded) {
-    return Integer.MAX_VALUE - encoded;
+  private static int decodeWeight(long encoded) {
+    return (int)(Integer.MAX_VALUE - encoded);
   }
 
   /** weight -> cost */
-  private static long encodeWeight(float value) {
-    if (Float.isNaN(value) || Float.isInfinite(value) || value < 0 || value > Integer.MAX_VALUE) {
+  private static int encodeWeight(long value) {
+    if (value < 0 || value > Integer.MAX_VALUE) {
       throw new UnsupportedOperationException("cannot encode value: " + value);
     }
     return Integer.MAX_VALUE - (int)value;
   }
 
+  private final class WFSTTermFreqIteratorWrapper extends SortedTermFreqIteratorWrapper {
+
+    WFSTTermFreqIteratorWrapper(TermFreqIterator source,
+        Comparator<BytesRef> comparator) throws IOException {
+      super(source, comparator, true);
+    }
+
+    @Override
+    protected void encode(ByteSequencesWriter writer, ByteArrayDataOutput output, byte[] buffer, BytesRef spare, long weight) throws IOException {
+      if (spare.length + 5 >= buffer.length) {
+        buffer = ArrayUtil.grow(buffer, spare.length + 5);
+      }
+      output.reset(buffer);
+      output.writeBytes(spare.bytes, spare.offset, spare.length);
+      output.writeByte((byte)0); // separator: not used, just for sort order
+      output.writeInt(encodeWeight(weight));
+      writer.write(buffer, 0, output.getPosition());
+    }
+
+    @Override
+    protected long decode(BytesRef scratch, ByteArrayDataInput tmpInput) {
+      tmpInput.reset(scratch.bytes);
+      tmpInput.skipBytes(scratch.length - 4); // suggestion + separator
+      scratch.length -= 5; // sep + long
+      return tmpInput.readInt();
+    }
+  }
+
   static final Comparator<Long> weightComparator = new Comparator<Long> () {
     public int compare(Long left, Long right) {
       return left.compareTo(right);

@@ -55,24 +55,22 @@ public class JaspellLookup extends Lookup {
     final CharsRef charsSpare = new CharsRef();
 
     while ((spare = tfit.next()) != null) {
-      float freq = tfit.weight();
+      final long weight = tfit.weight();
       if (spare.length == 0) {
         continue;
       }
       charsSpare.grow(spare.length);
       UnicodeUtil.UTF8toUTF16(spare.bytes, spare.offset, spare.length, charsSpare);
-      trie.put(charsSpare.toString(), new Float(freq));
+      trie.put(charsSpare.toString(), Long.valueOf(weight));
     }
   }
 
-  @Override
   public boolean add(CharSequence key, Object value) {
     trie.put(key, value);
     // XXX
     return false;
   }
 
-  @Override
   public Object get(CharSequence key) {
     return trie.get(key);
   }
@@ -95,7 +93,7 @@ public class JaspellLookup extends Lookup {
     if (onlyMorePopular) {
       LookupPriorityQueue queue = new LookupPriorityQueue(num);
       for (String s : list) {
-        float freq = (Float)trie.get(s);
+        long freq = ((Number)trie.get(s)).longValue();
         queue.insertWithOverflow(new LookupResult(new CharsRef(s), freq));
       }
       for (LookupResult lr : queue.getResults()) {
@@ -104,7 +102,7 @@ public class JaspellLookup extends Lookup {
     } else {
       for (int i = 0; i < maxCnt; i++) {
         String s = list.get(i);
-        float freq = (Float)trie.get(s);
+        long freq = ((Number)trie.get(s)).longValue();
         res.add(new LookupResult(new CharsRef(s), freq));
       }
     }
@@ -131,7 +129,7 @@ public class JaspellLookup extends Lookup {
     node.splitchar = in.readChar();
     byte mask = in.readByte();
     if ((mask & HAS_VALUE) != 0) {
-      node.data = new Float(in.readFloat());
+      node.data = Long.valueOf(in.readLong());
     }
     if ((mask & LO_KID) != 0) {
       TSTNode kid = trie.new TSTNode('\0', node);
@@ -171,7 +169,7 @@ public class JaspellLookup extends Lookup {
     if (node.data != null) mask |= HAS_VALUE;
     out.writeByte(mask);
     if (node.data != null) {
-      out.writeFloat((Float)node.data);
+      out.writeLong(((Number)node.data).longValue());
     }
     writeRecursively(out, node.relatives[TSTNode.LOKID]);
     writeRecursively(out, node.relatives[TSTNode.EQKID]);


@@ -50,26 +50,24 @@ public class TSTLookup extends Lookup {
     }
 
     ArrayList<String> tokens = new ArrayList<String>();
-    ArrayList<Float> vals = new ArrayList<Float>();
+    ArrayList<Number> vals = new ArrayList<Number>();
     BytesRef spare;
     CharsRef charsSpare = new CharsRef();
     while ((spare = tfit.next()) != null) {
       charsSpare.grow(spare.length);
       UnicodeUtil.UTF8toUTF16(spare.bytes, spare.offset, spare.length, charsSpare);
       tokens.add(charsSpare.toString());
-      vals.add(new Float(tfit.weight()));
+      vals.add(Long.valueOf(tfit.weight()));
     }
     autocomplete.balancedTree(tokens.toArray(), vals.toArray(), 0, tokens.size() - 1, root);
   }
 
-  @Override
   public boolean add(CharSequence key, Object value) {
     autocomplete.insert(root, key, value, 0);
     // XXX we don't know if a new node was created
     return true;
   }
 
-  @Override
   public Object get(CharSequence key) {
     List<TernaryTreeNode> list = autocomplete.prefixCompletion(root, key, 0);
     if (list == null || list.isEmpty()) {
@@ -107,7 +105,7 @@ public class TSTLookup extends Lookup {
     if (onlyMorePopular) {
       LookupPriorityQueue queue = new LookupPriorityQueue(num);
       for (TernaryTreeNode ttn : list) {
-        queue.insertWithOverflow(new LookupResult(ttn.token, (Float)ttn.val));
+        queue.insertWithOverflow(new LookupResult(ttn.token, ((Number)ttn.val).longValue()));
       }
       for (LookupResult lr : queue.getResults()) {
         res.add(lr);
@@ -115,7 +113,7 @@ public class TSTLookup extends Lookup {
     } else {
       for (int i = 0; i < maxCnt; i++) {
         TernaryTreeNode ttn = list.get(i);
-        res.add(new LookupResult(ttn.token, (Float)ttn.val));
+        res.add(new LookupResult(ttn.token, ((Number)ttn.val).longValue()));
       }
     }
     return res;
@@ -146,7 +144,7 @@ public class TSTLookup extends Lookup {
       node.token = in.readUTF();
     }
     if ((mask & HAS_VALUE) != 0) {
-      node.val = new Float(in.readFloat());
+      node.val = Long.valueOf(in.readLong());
     }
     if ((mask & LO_KID) != 0) {
       node.loKid = new TernaryTreeNode();
@@ -184,7 +182,7 @@ public class TSTLookup extends Lookup {
     if (node.val != null) mask |= HAS_VALUE;
     out.writeByte(mask);
     if (node.token != null) out.writeUTF(node.token);
-    if (node.val != null) out.writeFloat((Float)node.val);
+    if (node.val != null) out.writeLong(((Number)node.val).longValue());
     // recurse and write kids
     if (node.loKid != null) {
       writeRecursively(out, node.loKid);


@@ -17,8 +17,10 @@
 package org.apache.lucene.search.suggest;
 
 import java.io.File;
+import java.util.List;
 
 import org.apache.lucene.search.suggest.Lookup;
+import org.apache.lucene.search.suggest.Lookup.LookupResult;
 import org.apache.lucene.search.suggest.fst.FSTCompletionLookup;
 import org.apache.lucene.search.suggest.jaspell.JaspellLookup;
 import org.apache.lucene.search.suggest.tst.TSTLookup;
@@ -74,16 +76,18 @@ public class PersistenceTest extends LuceneTestCase {
     lookup.load(storeDir);
 
     // Assert validity.
-    float previous = Float.NEGATIVE_INFINITY;
+    long previous = Long.MIN_VALUE;
     for (TermFreq k : keys) {
-      Float val = (Float) lookup.get(_TestUtil.bytesToCharSequence(k.term, random));
-      assertNotNull(k.term.utf8ToString(), val);
+      List<LookupResult> list = lookup.lookup(_TestUtil.bytesToCharSequence(k.term, random), false, 1);
+      assertEquals(1, list.size());
+      LookupResult lookupResult = list.get(0);
+      assertNotNull(k.term.utf8ToString(), lookupResult.key);
       if (supportsExactWeights) {
-        assertEquals(k.term.utf8ToString(), Float.valueOf(k.v), val);
+        assertEquals(k.term.utf8ToString(), k.v, lookupResult.value);
       } else {
-        assertTrue(val + ">=" + previous, val >= previous);
-        previous = val.floatValue();
+        assertTrue(lookupResult.value + ">=" + previous, lookupResult.value >= previous);
+        previous = lookupResult.value;
      }
    }
  }

View File

@ -29,59 +29,79 @@ import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil; import org.apache.lucene.util._TestUtil;
public class TestBytesRefList extends LuceneTestCase { public class TestBytesRefList extends LuceneTestCase {
public void testAppend() throws IOException { public void testAppend() throws IOException {
BytesRefList list = new BytesRefList(); BytesRefList list = new BytesRefList();
List<String> stringList = new ArrayList<String>(); List<String> stringList = new ArrayList<String>();
int entries = atLeast(500); for (int j = 0; j < 2; j++) {
BytesRef spare = new BytesRef(); if (j > 0 && random.nextBoolean()) {
for (int i = 0; i < entries; i++) { list.clear();
String randomRealisticUnicodeString = _TestUtil stringList.clear();
.randomRealisticUnicodeString(random); }
spare.copyChars(randomRealisticUnicodeString); int entries = atLeast(500);
list.append(spare); BytesRef spare = new BytesRef();
stringList.add(randomRealisticUnicodeString); for (int i = 0; i < entries; i++) {
} String randomRealisticUnicodeString = _TestUtil
for (int i = 0; i < entries; i++) { .randomRealisticUnicodeString(random);
assertNotNull(list.get(spare, i)); spare.copyChars(randomRealisticUnicodeString);
assertEquals("entry " + i + " doesn't match", stringList.get(i), list.append(spare);
spare.utf8ToString()); stringList.add(randomRealisticUnicodeString);
} }
for (int i = 0; i < entries; i++) {
// check random assertNotNull(list.get(spare, i));
for (int i = 0; i < entries; i++) { assertEquals("entry " + i + " doesn't match", stringList.get(i),
int e = random.nextInt(entries); spare.utf8ToString());
assertNotNull(list.get(spare, e)); }
assertEquals("entry " + i + " doesn't match", stringList.get(e),
spare.utf8ToString()); // check random
} for (int i = 0; i < entries; i++) {
for (int i = 0; i < 2; i++) { int e = random.nextInt(entries);
assertNotNull(list.get(spare, e));
BytesRefIterator iterator = list.iterator(); assertEquals("entry " + i + " doesn't match", stringList.get(e),
for (String string : stringList) { spare.utf8ToString());
assertEquals(string, iterator.next().utf8ToString()); }
for (int i = 0; i < 2; i++) {
BytesRefIterator iterator = list.iterator();
for (String string : stringList) {
assertEquals(string, iterator.next().utf8ToString());
}
} }
} }
} }
public void testSort() { public void testSort() throws IOException {
BytesRefList list = new BytesRefList(); BytesRefList list = new BytesRefList();
List<String> stringList = new ArrayList<String>(); List<String> stringList = new ArrayList<String>();
int entries = atLeast(500);
BytesRef spare = new BytesRef(); for (int j = 0; j < 2; j++) {
for (int i = 0; i < entries; i++) { if (j > 0 && random.nextBoolean()) {
String randomRealisticUnicodeString = _TestUtil.randomRealisticUnicodeString(random); list.clear();
spare.copyChars(randomRealisticUnicodeString); stringList.clear();
list.append(spare); }
stringList.add(randomRealisticUnicodeString); int entries = atLeast(500);
} BytesRef spare = new BytesRef();
Collections.sort(stringList); for (int i = 0; i < entries; i++) {
int[] sortedOrds = list.sort(BytesRef.getUTF8SortedAsUTF16Comparator()); String randomRealisticUnicodeString = _TestUtil
for (int i = 0; i < entries; i++) { .randomRealisticUnicodeString(random);
assertNotNull(list.get(spare, sortedOrds[i])); spare.copyChars(randomRealisticUnicodeString);
assertEquals("entry " + i + " doesn't match", stringList.get(i), list.append(spare);
spare.utf8ToString()); stringList.add(randomRealisticUnicodeString);
}
Collections.sort(stringList);
BytesRefIterator iter = list.iterator(BytesRef
.getUTF8SortedAsUTF16Comparator());
int i = 0;
while ((spare = iter.next()) != null) {
assertEquals("entry " + i + " doesn't match", stringList.get(i),
spare.utf8ToString());
i++;
}
assertNull(iter.next());
assertEquals(i, stringList.size());
} }
} }
} }

View File

@ -17,12 +17,16 @@ package org.apache.lucene.search.suggest;
* the License. * the License.
*/ */
import java.util.Comparator;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map; import java.util.Map;
import java.util.TreeMap; import java.util.TreeMap;
import org.apache.lucene.search.spell.TermFreqIterator; import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil; import org.apache.lucene.util._TestUtil;
@ -38,7 +42,8 @@ public class TestTermFreqIterator extends LuceneTestCase {
public void testTerms() throws Exception { public void testTerms() throws Exception {
int num = atLeast(10000); int num = atLeast(10000);
TreeMap<BytesRef,Long> sorted = new TreeMap<BytesRef,Long>(); Comparator<BytesRef> comparator = random.nextBoolean() ? BytesRef.getUTF8SortedAsUnicodeComparator() : BytesRef.getUTF8SortedAsUTF16Comparator();
TreeMap<BytesRef,Long> sorted = new TreeMap<BytesRef,Long>(comparator);
TermFreq[] unsorted = new TermFreq[num]; TermFreq[] unsorted = new TermFreq[num];
for (int i = 0; i < num; i++) { for (int i = 0; i < num; i++) {
@ -52,13 +57,13 @@ public class TestTermFreqIterator extends LuceneTestCase {
} }
// test the sorted iterator wrapper // test the sorted iterator wrapper
TermFreqIterator wrapper = new SortedTermFreqIteratorWrapper(new TermFreqArrayIterator(unsorted), BytesRef.getUTF8SortedAsUnicodeComparator()); TermFreqIterator wrapper = new SortedTermFreqIteratorWrapper(new TermFreqArrayIterator(unsorted), comparator);
Iterator<Map.Entry<BytesRef,Long>> expected = sorted.entrySet().iterator(); Iterator<Map.Entry<BytesRef,Long>> expected = sorted.entrySet().iterator();
while (expected.hasNext()) { while (expected.hasNext()) {
Map.Entry<BytesRef,Long> entry = expected.next(); Map.Entry<BytesRef,Long> entry = expected.next();
assertEquals(entry.getKey(), wrapper.next()); assertEquals(entry.getKey(), wrapper.next());
assertEquals(entry.getValue().longValue(), wrapper.weight(), 0F); assertEquals(entry.getValue().longValue(), wrapper.weight());
} }
assertNull(wrapper.next()); assertNull(wrapper.next());
@ -72,4 +77,57 @@ public class TestTermFreqIterator extends LuceneTestCase {
} }
assertEquals(sorted, actual); assertEquals(sorted, actual);
} }
public void testRaw() throws Exception {
int num = atLeast(10000);
Comparator<BytesRef> comparator = BytesRef.getUTF8SortedAsUnicodeComparator();
BytesRefHash sorted = new BytesRefHash();
TermFreq[] unsorted = new TermFreq[num];
byte[] buffer = new byte[0];
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
for (int i = 0; i < num; i++) {
BytesRef spare;
long weight;
do {
spare = new BytesRef(_TestUtil.randomUnicodeString(random));
if (spare.length + 8 >= buffer.length) {
buffer = ArrayUtil.grow(buffer, spare.length + 8);
}
output.reset(buffer);
output.writeBytes(spare.bytes, spare.offset, spare.length);
weight = random.nextLong();
output.writeLong(weight);
} while (sorted.add(new BytesRef(buffer, 0, output.getPosition())) < 0);
unsorted[i] = new TermFreq(spare, weight);
}
// test the sorted iterator wrapper
TermFreqIterator wrapper = new SortedTermFreqIteratorWrapper(new TermFreqArrayIterator(unsorted), comparator, true);
int[] sort = sorted.sort(comparator);
int size = sorted.size();
BytesRef spare = new BytesRef();
for (int i = 0; i < size; i++) {
sorted.get(sort[i], spare);
spare.length -= 8; // sub the long value
assertEquals(spare, wrapper.next());
spare.offset = spare.offset + spare.length;
spare.length = 8;
assertEquals(asLong(spare), wrapper.weight());
}
assertNull(wrapper.next());
}
public static long asLong(BytesRef b) {
return (((long) asIntInternal(b, b.offset) << 32) | asIntInternal(b,
b.offset + 4) & 0xFFFFFFFFL);
}
private static int asIntInternal(BytesRef b, int pos) {
return ((b.bytes[pos++] & 0xFF) << 24) | ((b.bytes[pos++] & 0xFF) << 16)
| ((b.bytes[pos++] & 0xFF) << 8) | (b.bytes[pos] & 0xFF);
}
} }

View File

@ -17,9 +17,8 @@ package org.apache.lucene.search.suggest.fst;
* limitations under the License. * limitations under the License.
*/ */
import java.util.Iterator;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test; import org.junit.Test;
@ -31,7 +30,7 @@ public class BytesRefSortersTest extends LuceneTestCase {
@Test @Test
public void testInMemorySorter() throws Exception { public void testInMemorySorter() throws Exception {
check(new InMemorySorter()); check(new InMemorySorter(BytesRef.getUTF8SortedAsUnicodeComparator()));
} }
private void check(BytesRefSorter sorter) throws Exception { private void check(BytesRefSorter sorter) throws Exception {
@ -42,8 +41,8 @@ public class BytesRefSortersTest extends LuceneTestCase {
} }
// Create two iterators and check that they're aligned with each other. // Create two iterators and check that they're aligned with each other.
Iterator<BytesRef> i1 = sorter.iterator(); BytesRefIterator i1 = sorter.iterator();
Iterator<BytesRef> i2 = sorter.iterator(); BytesRefIterator i2 = sorter.iterator();
// Verify sorter contract. // Verify sorter contract.
try { try {
@ -52,10 +51,12 @@ public class BytesRefSortersTest extends LuceneTestCase {
} catch (IllegalStateException e) { } catch (IllegalStateException e) {
// Expected. // Expected.
} }
BytesRef spare1;
while (i1.hasNext() && i2.hasNext()) { BytesRef spare2;
assertEquals(i1.next(), i2.next()); while ((spare1 = i1.next()) != null && (spare2 = i2.next()) != null) {
assertEquals(spare1, spare2);
} }
assertEquals(i1.hasNext(), i2.hasNext()); assertNull(i1.next());
assertNull(i2.next());
} }
} }

View File

@ -165,9 +165,9 @@ public class FSTCompletionTest extends LuceneTestCase {
// All the weights were constant, so all returned buckets must be constant, whatever they // All the weights were constant, so all returned buckets must be constant, whatever they
// are. // are.
Float previous = null; Long previous = null;
for (TermFreq tf : keys) { for (TermFreq tf : keys) {
Float current = (Float)lookup.get(_TestUtil.bytesToCharSequence(tf.term, random)); Long current = ((Number)lookup.get(_TestUtil.bytesToCharSequence(tf.term, random))).longValue();
if (previous != null) { if (previous != null) {
assertEquals(previous, current); assertEquals(previous, current);
} }
@ -181,7 +181,7 @@ public class FSTCompletionTest extends LuceneTestCase {
FSTCompletionLookup lookup = new FSTCompletionLookup(); FSTCompletionLookup lookup = new FSTCompletionLookup();
lookup.build(new TermFreqArrayIterator(input)); lookup.build(new TermFreqArrayIterator(input));
for (TermFreq tf : input) { for (TermFreq tf : input) {
assertTrue("Not found: " + tf.term.toString(), lookup.get(_TestUtil.bytesToCharSequence(tf.term, random)) != null); assertNotNull("Not found: " + tf.term.toString(), lookup.get(_TestUtil.bytesToCharSequence(tf.term, random)));
assertEquals(tf.term.utf8ToString(), lookup.lookup(_TestUtil.bytesToCharSequence(tf.term, random), true, 1).get(0).key.toString()); assertEquals(tf.term.utf8ToString(), lookup.lookup(_TestUtil.bytesToCharSequence(tf.term, random), true, 1).get(0).key.toString());
} }

View File

@ -1,140 +0,0 @@
package org.apache.lucene.search.suggest.fst;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.*;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.NumericUtils;
import org.junit.Ignore;
import org.junit.Test;
public class FloatMagicTest extends LuceneTestCase {
public void testFloatMagic() {
ArrayList<Float> floats = new ArrayList<Float>(Arrays.asList(
Float.intBitsToFloat(0x7f800001), // NaN (invalid combination).
Float.intBitsToFloat(0x7fffffff), // NaN (invalid combination).
Float.intBitsToFloat(0xff800001), // NaN (invalid combination).
Float.intBitsToFloat(0xffffffff), // NaN (invalid combination).
Float.POSITIVE_INFINITY,
Float.MAX_VALUE,
100f,
0f,
0.1f,
Float.MIN_VALUE,
Float.NaN,
-0.0f,
-Float.MIN_VALUE,
-0.1f,
-1f,
-10f,
Float.NEGATIVE_INFINITY));
// Sort them using juc.
Collections.sort(floats);
// Convert to sortable int4 representation (as long to have an unsigned sort).
long [] int4 = new long [floats.size()];
for (int i = 0; i < floats.size(); i++) {
int4[i] = FloatMagic.toSortable(floats.get(i)) & 0xffffffffL;
/*
System.out.println(
String.format("raw %8s sortable %8s %8s numutils %8s %s",
Integer.toHexString(Float.floatToRawIntBits(floats.get(i))),
Integer.toHexString(FloatMagic.toSortable(floats.get(i))),
Integer.toHexString(FloatMagic.unsignedOrderedToFloatBits(FloatMagic.toSortable(floats.get(i)))),
Integer.toHexString(NumericUtils.floatToSortableInt(floats.get(i))),
floats.get(i)));
*/
}
// Sort and compare. Should be identical order.
Arrays.sort(int4);
ArrayList<Float> backFromFixed = new ArrayList<Float>();
for (int i = 0; i < int4.length; i++) {
backFromFixed.add(FloatMagic.fromSortable((int) int4[i]));
}
/*
for (int i = 0; i < int4.length; i++) {
System.out.println(
floats.get(i) + " " + FloatMagic.fromSortable((int) int4[i]));
}
*/
assertEquals(floats, backFromFixed);
}
@Ignore("Once checked, valid forever?") @Test
public void testRoundTripFullRange() {
int i = 0;
do {
float f = Float.intBitsToFloat(i);
float f2 = FloatMagic.fromSortable(FloatMagic.toSortable(f));
if (!((Float.isNaN(f) && Float.isNaN(f2)) || f == f2)) {
throw new RuntimeException("! " + Integer.toHexString(i) + "> " + f + " " + f2);
}
if ((i & 0xffffff) == 0) {
System.out.println(Integer.toHexString(i));
}
i++;
} while (i != 0);
}
@Ignore("Once checked, valid forever?") @Test
public void testIncreasingFullRange() {
// -infinity ... -0.0
for (int i = 0xff800000; i != 0x80000000; i--) {
checkSmaller(i, i - 1);
}
// -0.0 +0.0
checkSmaller(0x80000000, 0);
// +0.0 ... +infinity
for (int i = 0; i != 0x7f800000; i++) {
checkSmaller(i, i + 1);
}
// All other are NaNs and should be after positive infinity.
final long infinity = toSortableL(Float.POSITIVE_INFINITY);
for (int i = 0x7f800001; i != 0x7fffffff; i++) {
assertTrue(infinity < toSortableL(Float.intBitsToFloat(i)));
}
for (int i = 0xff800001; i != 0xffffffff; i++) {
assertTrue(infinity < toSortableL(Float.intBitsToFloat(i)));
}
}
private long toSortableL(float f) {
return FloatMagic.toSortable(f) & 0xffffffffL;
}
private void checkSmaller(int i1, int i2) {
float f1 = Float.intBitsToFloat(i1);
float f2 = Float.intBitsToFloat(i2);
if (f1 > f2) {
throw new AssertionError(f1 + " " + f2 + " " + i1 + " " + i2);
}
assertTrue(toSortableL(f1) < toSortableL(f2));
}
}

View File

@ -20,6 +20,7 @@ package org.apache.lucene.search.suggest.fst;
import java.io.*; import java.io.*;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Comparator;
import org.apache.lucene.search.suggest.fst.Sort.BufferSize; import org.apache.lucene.search.suggest.fst.Sort.BufferSize;
import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesWriter; import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesWriter;
@ -61,7 +62,7 @@ public class TestSort extends LuceneTestCase {
@Test @Test
public void testIntermediateMerges() throws Exception { public void testIntermediateMerges() throws Exception {
// Sort 20 mb worth of data with 1mb buffer, binary merging. // Sort 20 mb worth of data with 1mb buffer, binary merging.
SortInfo info = checkSort(new Sort(BufferSize.megabytes(1), Sort.defaultTempDir(), 2), SortInfo info = checkSort(new Sort(Sort.DEFAULT_COMPARATOR, BufferSize.megabytes(1), Sort.defaultTempDir(), 2),
generateRandom(Sort.MB * 20)); generateRandom(Sort.MB * 20));
assertTrue(info.mergeRounds > 10); assertTrue(info.mergeRounds > 10);
} }
@ -69,7 +70,7 @@ public class TestSort extends LuceneTestCase {
@Test @Test
public void testSmallRandom() throws Exception { public void testSmallRandom() throws Exception {
// Sort 20 mb worth of data with 1mb buffer. // Sort 20 mb worth of data with 1mb buffer.
SortInfo sortInfo = checkSort(new Sort(BufferSize.megabytes(1), Sort.defaultTempDir(), Sort.MAX_TEMPFILES), SortInfo sortInfo = checkSort(new Sort(Sort.DEFAULT_COMPARATOR, BufferSize.megabytes(1), Sort.defaultTempDir(), Sort.MAX_TEMPFILES),
generateRandom(Sort.MB * 20)); generateRandom(Sort.MB * 20));
assertEquals(1, sortInfo.mergeRounds); assertEquals(1, sortInfo.mergeRounds);
} }
@ -77,7 +78,7 @@ public class TestSort extends LuceneTestCase {
@Test @Nightly @Test @Nightly
public void testLargerRandom() throws Exception { public void testLargerRandom() throws Exception {
// Sort 100MB worth of data with 15mb buffer. // Sort 100MB worth of data with 15mb buffer.
checkSort(new Sort(BufferSize.megabytes(16), Sort.defaultTempDir(), Sort.MAX_TEMPFILES), checkSort(new Sort(Sort.DEFAULT_COMPARATOR, BufferSize.megabytes(16), Sort.defaultTempDir(), Sort.MAX_TEMPFILES),
generateRandom(Sort.MB * 100)); generateRandom(Sort.MB * 100));
} }
@ -92,14 +93,25 @@ public class TestSort extends LuceneTestCase {
byte [][] bytes = data.toArray(new byte[data.size()][]); byte [][] bytes = data.toArray(new byte[data.size()][]);
return bytes; return bytes;
} }
static final Comparator<byte[]> unsignedByteOrderComparator = new Comparator<byte[]>() {
public int compare(byte[] left, byte[] right) {
final int max = Math.min(left.length, right.length);
for (int i = 0, j = 0; i < max; i++, j++) {
int diff = (left[i] & 0xff) - (right[j] & 0xff);
if (diff != 0)
return diff;
}
return left.length - right.length;
}
};
/** /**
* Check sorting data on an instance of {@link Sort}. * Check sorting data on an instance of {@link Sort}.
*/ */
private SortInfo checkSort(Sort sort, byte[][] data) throws IOException { private SortInfo checkSort(Sort sort, byte[][] data) throws IOException {
File unsorted = writeAll("unsorted", data); File unsorted = writeAll("unsorted", data);
Arrays.sort(data, Sort.unsignedByteOrderComparator); Arrays.sort(data, unsignedByteOrderComparator);
File golden = writeAll("golden", data); File golden = writeAll("golden", data);
File sorted = new File(tempDir, "sorted"); File sorted = new File(tempDir, "sorted");

View File

@ -117,7 +117,7 @@ public class WFSTCompletionTest extends LuceneTestCase {
// TODO: could be faster... but its slowCompletor for a reason // TODO: could be faster... but its slowCompletor for a reason
for (Map.Entry<String,Long> e : slowCompletor.entrySet()) { for (Map.Entry<String,Long> e : slowCompletor.entrySet()) {
if (e.getKey().startsWith(prefix)) { if (e.getKey().startsWith(prefix)) {
matches.add(new LookupResult(e.getKey(), (float)e.getValue().longValue())); matches.add(new LookupResult(e.getKey(), e.getValue().longValue()));
} }
} }

View File

@ -153,11 +153,6 @@ public class Suggester extends SolrSpellChecker {
build(core, searcher); build(core, searcher);
} }
public void add(CharsRef query, int numHits) {
LOG.info("add " + query + ", " + numHits);
lookup.add(query, new Integer(numHits));
}
static SpellingResult EMPTY_RESULT = new SpellingResult(); static SpellingResult EMPTY_RESULT = new SpellingResult();
@Override @Override
@ -182,7 +177,7 @@ public class Suggester extends SolrSpellChecker {
Collections.sort(suggestions); Collections.sort(suggestions);
} }
for (LookupResult lr : suggestions) { for (LookupResult lr : suggestions) {
res.add(t, lr.key.toString(), ((Number)lr.value).intValue()); res.add(t, lr.key.toString(), (int)lr.value);
} }
} }
return res; return res;