LUCENE-3807: Cleanup Suggest / Lookup API

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1296268 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Simon Willnauer 2012-03-02 15:59:55 +00:00
parent 2c94c522fd
commit f303bcd465
27 changed files with 692 additions and 627 deletions

View File

@ -115,6 +115,13 @@ Changes in backwards compatibility policy
* LUCENE-3626: The internal implementation classes in PKIndexSplitter
and MultiPassIndexSplitter were made private as they now work
per segment. (Uwe Schindler)
* LUCENE-3807: Cleaned up Suggest / Lookup API. Term weights (freqs) are now
64bit signed integers instead of 32bit floats. Sorting of terms is now a
disk based merge sort instead of an in-memory sort. The Lookup API now
accepts and returns CharSequence instead of String; a CharSequence should be converted
into a String before being used in a data structure that relies on hashCode / equals.
(Simon Willnauer)
Changes in Runtime Behavior

View File

@ -25,6 +25,7 @@ import org.apache.lucene.util.BytesRef;
/**
* This wrapper buffers incoming elements.
* @lucene.experimental
*/
public class BufferingTermFreqIteratorWrapper implements TermFreqIterator {
// TODO keep this for now

View File

@ -18,81 +18,113 @@ package org.apache.lucene.search.suggest;
*/
import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.SorterTemplate;
final class BytesRefList {
/**
* A simple append only random-access {@link BytesRef} array that stores full
* copies of the appended bytes in a {@link ByteBlockPool}.
*
*
* <b>Note: This class is not Thread-Safe!</b>
*
* @lucene.internal
* @lucene.experimental
*/
public final class BytesRefList {
// TODO rename to BytesRefArray
private final ByteBlockPool pool;
private int[] offsets = new int[1];
private int currentElement = 0;
private int lastElement = 0;
private int currentOffset = 0;
private final Counter bytesUsed = Counter.newCounter(false);
/**
* Creates a new {@link BytesRefList}
*/
public BytesRefList() {
this(new ByteBlockPool(new ByteBlockPool.DirectAllocator()));
}
public BytesRefList(ByteBlockPool pool) {
this.pool = pool;
this.pool = new ByteBlockPool(new ByteBlockPool.DirectTrackingAllocator(
bytesUsed));
pool.nextBuffer();
bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_ARRAY_HEADER
+ RamUsageEstimator.NUM_BYTES_INT);
}
public int append(BytesRef bytes) {
if (currentElement >= offsets.length) {
offsets = ArrayUtil.grow(offsets, offsets.length + 1);
}
pool.copy(bytes);
offsets[currentElement++] = currentOffset;
currentOffset += bytes.length;
return currentElement;
}
public int size() {
return currentElement;
}
public BytesRef get(BytesRef bytes, int pos) {
if (currentElement > pos) {
bytes.offset = offsets[pos];
bytes.length = pos == currentElement - 1 ? currentOffset - bytes.offset
: offsets[pos + 1] - bytes.offset;
pool.copyFrom(bytes);
return bytes;
}
throw new IndexOutOfBoundsException("index " + pos
+ " must be less than the size: " + currentElement);
}
public BytesRefIterator iterator() {
final int numElements = currentElement;
return new BytesRefIterator() {
private final BytesRef spare = new BytesRef();
private int pos = 0;
@Override
public BytesRef next() throws IOException {
if (pos < numElements) {
get(spare, pos++);
return spare;
}
return null;
}
@Override
public Comparator<BytesRef> getComparator() {
return null;
}
};
/**
* Clears this {@link BytesRefList}: the element count and the running byte
* offset are set back to zero, the offsets table is zero-filled (its capacity
* is kept) and the backing {@link ByteBlockPool} is reset.
*/
public void clear() {
lastElement = 0;
currentOffset = 0;
Arrays.fill(offsets, 0);
pool.reset();
}
public int[] sort(final Comparator<BytesRef> comp) {
/**
* Appends a copy of the given {@link BytesRef} to this {@link BytesRefList}.
* @param bytes the bytes to append
* @return the number of elements in this list after the append, i.e. the
* ordinal of the appended bytes plus one
*/
public int append(BytesRef bytes) {
if (lastElement >= offsets.length) {
// offsets table is full: grow it and account for the additional ints
int oldLen = offsets.length;
offsets = ArrayUtil.grow(offsets, offsets.length + 1);
bytesUsed.addAndGet((offsets.length - oldLen)
* RamUsageEstimator.NUM_BYTES_INT);
}
pool.copy(bytes);
// remember where this element starts; its end is the next element's start
offsets[lastElement++] = currentOffset;
currentOffset += bytes.length;
return lastElement;
}
/**
* Returns the number of elements appended to this {@link BytesRefList} since
* it was created or last cleared.
* @return the current size of this {@link BytesRefList}
*/
public int size() {
return lastElement;
}
/**
 * Fills the given spare with the element stored under the given ordinal.
 * @param spare a reusable {@link BytesRef} instance that receives the element
 * @param ord the ordinal of the element to retrieve
 * @return the given spare, now holding the requested element
 * @throws IndexOutOfBoundsException if the ordinal is not less than the
 *         current size
 */
public BytesRef get(BytesRef spare, int ord) {
  if (ord >= lastElement) {
    throw new IndexOutOfBoundsException("index " + ord
        + " must be less than the size: " + lastElement);
  }
  final int start = offsets[ord];
  // the last element ends at the running offset, any other one ends where
  // its successor starts
  final int end;
  if (ord == lastElement - 1) {
    end = currentOffset;
  } else {
    end = offsets[ord + 1];
  }
  spare.offset = start;
  spare.length = end - start;
  pool.copyFrom(spare);
  return spare;
}
/**
* Returns the number of bytes internally used to hold the appended bytes in
* memory.
*
* @return the number of bytes internally used to hold the appended bytes in
* memory
*/
public long bytesUsed() {
return bytesUsed.get();
}
private int[] sort(final Comparator<BytesRef> comp) {
final int[] orderdEntries = new int[size()];
for (int i = 0; i < orderdEntries.length; i++) {
orderdEntries[i] = i;
@ -110,22 +142,65 @@ final class BytesRefList {
final int ord1 = orderdEntries[i], ord2 = orderdEntries[j];
return comp.compare(get(scratch1, ord1), get(scratch2, ord2));
}
@Override
protected void setPivot(int i) {
final int ord = orderdEntries[i];
get(pivot, ord);
}
@Override
protected int comparePivot(int j) {
final int ord = orderdEntries[j];
return comp.compare(pivot, get(scratch2, ord));
}
private final BytesRef pivot = new BytesRef(),
scratch1 = new BytesRef(), scratch2 = new BytesRef();
private final BytesRef pivot = new BytesRef(), scratch1 = new BytesRef(),
scratch2 = new BytesRef();
}.quickSort(0, size() - 1);
return orderdEntries;
}
/**
* Sugar for {@link #iterator(Comparator)} with a <code>null</code> comparator.
*/
public BytesRefIterator iterator() {
return iterator(null);
}
/**
 * <p>
 * Creates a {@link BytesRefIterator} with point in time semantics: it gives
 * access to the {@link BytesRef} instances that were appended before this
 * method was called.
 * </p>
 * <p>
 * When a non <code>null</code> {@link Comparator} is passed the values are
 * returned in the order defined by that comparator, otherwise they are
 * returned in the order they were appended.
 * </p>
 * <p>
 * This is a non-destructive operation. The iterator hands out the same
 * {@link BytesRef} instance on every call.
 * </p>
 */
public BytesRefIterator iterator(final Comparator<BytesRef> comp) {
  final BytesRef reused = new BytesRef();
  final int limit = size();
  final int[] sortedOrds;
  if (comp == null) {
    sortedOrds = null;
  } else {
    sortedOrds = sort(comp);
  }
  return new BytesRefIterator() {
    private int upto = 0;

    @Override
    public BytesRef next() throws IOException {
      if (upto >= limit) {
        return null;
      }
      // without sorted ordinals the position itself is the ordinal
      final int ord = sortedOrds == null ? upto : sortedOrds[upto];
      upto++;
      return get(reused, ord);
    }

    @Override
    public Comparator<BytesRef> getComparator() {
      return comp;
    }
  };
}
}

View File

@ -75,7 +75,11 @@ public class FileDictionary implements Dictionary {
String[] fields = line.split("\t");
if (fields.length > 1) {
// keep reading floats for bw compat
curFreq = (int)Float.parseFloat(fields[1]);
try {
curFreq = Long.parseLong(fields[1]);
} catch (NumberFormatException e) {
curFreq = (long)Double.parseDouble(fields[1]);
}
spare.copyChars(fields[0]);
} else {
spare.copyChars(line);

View File

@ -29,15 +29,19 @@ import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.PriorityQueue;
/**
* Simple Lookup interface for {@link CharSequence} suggestions.
* @lucene.experimental
*/
public abstract class Lookup {
/**
* Result of a lookup.
*/
public static final class LookupResult implements Comparable<LookupResult> {
public final CharSequence key;
public final float value;
public final long value;
public LookupResult(CharSequence key, float value) {
public LookupResult(CharSequence key, long value) {
this.key = key;
this.value = value;
}
@ -112,6 +116,10 @@ public abstract class Lookup {
build(tfit);
}
/**
* Builds up a new internal {@link Lookup} representation based on the given {@link TermFreqIterator}.
* The implementation might re-sort the data internally.
*/
public abstract void build(TermFreqIterator tfit) throws IOException;
/**
@ -124,22 +132,7 @@ public abstract class Lookup {
*/
public abstract List<LookupResult> lookup(CharSequence key, boolean onlyMorePopular, int num);
/**
* Modify the lookup data by recording additional data. Optional operation.
* @param key new lookup key
* @param value value to associate with this key
* @return true if new key is added, false if it already exists or operation
* is not supported.
*/
public abstract boolean add(CharSequence key, Object value);
/**
* Get value associated with a specific key.
* @param key lookup key
* @return associated value
*/
public abstract Object get(CharSequence key);
/**
* Persist the constructed lookup data to a directory. Optional operation.
* @param output {@link OutputStream} to write the data to.
@ -173,4 +166,5 @@ public abstract class Lookup {
* @throws IOException when fatal IO error occurs.
*/
public abstract boolean load(File storeDir) throws IOException;
}

View File

@ -17,45 +17,166 @@ package org.apache.lucene.search.suggest;
* limitations under the License.
*/
import java.io.File;
import java.io.IOException;
import java.util.Comparator;
import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.search.suggest.fst.Sort;
import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesReader;
import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesWriter;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
/**
* This wrapper buffers incoming elements and makes sure they are sorted in
* ascending lexicographic order.
* This wrapper buffers incoming elements and makes sure they are sorted based on given comparator.
* @lucene.experimental
*/
public class SortedTermFreqIteratorWrapper extends BufferingTermFreqIteratorWrapper {
// TODO keep this for now - but the consumer should really sort this stuff on disk with sorter...
private final int[] sortedOrds;
private int currentOrd = -1;
private final BytesRef spare = new BytesRef();
private final Comparator<BytesRef> comp;
public SortedTermFreqIteratorWrapper(TermFreqIterator source, Comparator<BytesRef> comp) throws IOException {
super(source);
this.sortedOrds = entries.sort(comp);
this.comp = comp;
public class SortedTermFreqIteratorWrapper implements TermFreqIterator {
private final TermFreqIterator source;
private File tempInput;
private File tempSorted;
private final ByteSequencesReader reader;
private boolean done = false;
private long weight;
private final BytesRef scratch = new BytesRef();
private final Comparator<BytesRef> comparator;
/**
* Creates a wrapper around the given source that is sorted with the given
* comparator. Delegates to the three-argument constructor with
* <code>compareRawBytes</code> set to <code>false</code>.
* @param source the iterator providing the terms and weights to sort
* @param comparator the comparator defining the sort order
* @throws IOException if the delegated constructor fails
*/
public SortedTermFreqIteratorWrapper(TermFreqIterator source, Comparator<BytesRef> comparator) throws IOException {
this(source, comparator, false);
}
@Override
public long weight() {
return freqs[currentOrd];
/**
* Creates a wrapper around the given source and sorts its content right away.
* @param source the iterator providing the terms and weights to sort
* @param comparator the comparator defining the sort order
* @param compareRawBytes if <code>true</code> the comparator is handed to the
* sort unchanged; if <code>false</code> it is wrapped in a
* {@link BytesOnlyComparator}, which hides the trailing 8 bytes of every
* entry from it
* @throws IOException if sorting fails
*/
public SortedTermFreqIteratorWrapper(TermFreqIterator source, Comparator<BytesRef> comparator, boolean compareRawBytes) throws IOException {
this.source = source;
this.comparator = comparator;
this.reader = sort(compareRawBytes ? comparator : new BytesOnlyComparator(this.comparator));
}
@Override
public BytesRef next() throws IOException {
if (++curPos < entries.size()) {
return entries.get(spare, (currentOrd = sortedOrds[curPos]));
boolean success = false;
if (done) {
return null;
}
try {
ByteArrayDataInput input = new ByteArrayDataInput();
if (reader.read(scratch)) {
weight = decode(scratch, input);
success = true;
return scratch;
}
close();
success = done = true;
return null;
} finally {
if (!success) {
done = true;
close();
}
}
return null;
}
@Override
public Comparator<BytesRef> getComparator() {
return comp;
return comparator;
}
@Override
public long weight() {
return weight;
}
/**
* Writes every entry of the source (encoded via encode) to a temporary input
* file, sorts that file into a second temporary file with a {@link Sort} that
* uses the given comparator, and returns a reader over the sorted file.
* If anything fails the writer is closed and close() is invoked.
* @param comparator the comparator handed to the {@link Sort}
* @return a reader over the sorted temporary file
* @throws IOException if creating, writing, sorting or opening a file fails
*/
private Sort.ByteSequencesReader sort(Comparator<BytesRef> comparator) throws IOException {
String prefix = getClass().getSimpleName();
File directory = Sort.defaultTempDir();
tempInput = File.createTempFile(prefix, ".input", directory);
tempSorted = File.createTempFile(prefix, ".sorted", directory);
final Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput);
boolean success = false;
try {
BytesRef spare;
byte[] buffer = new byte[0];
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
while ((spare = source.next()) != null) {
encode(writer, output, buffer, spare, source.weight());
}
// the input file must be complete before it is sorted
writer.close();
new Sort(comparator).sort(tempInput, tempSorted);
ByteSequencesReader reader = new Sort.ByteSequencesReader(tempSorted);
success = true;
return reader;
} finally {
if (success) {
// NOTE(review): the writer was already closed above; presumably a second
// close is harmless - confirm
IOUtils.close(writer);
} else {
try {
// do not let a failing close hide the original exception
IOUtils.closeWhileHandlingException(writer);
} finally {
close();
}
}
}
}
/**
* Deletes the temporary input and sorted files, if they have been created,
* and closes the reader. The return values of {@link File#delete()} are
* ignored.
* @throws IOException if closing the reader fails
*/
private void close() throws IOException {
if (tempInput != null) {
tempInput.delete();
}
if (tempSorted != null) {
tempSorted.delete();
}
IOUtils.close(reader);
}
/**
 * Comparator adapter that hides the trailing 8 bytes of both entries from
 * the wrapped comparator. The entries are not copied; two reusable
 * {@link BytesRef} views pointing into the original arrays are compared.
 */
private final static class BytesOnlyComparator implements Comparator<BytesRef> {
  final Comparator<BytesRef> other;
  private final BytesRef leftScratch = new BytesRef();
  private final BytesRef rightScratch = new BytesRef();

  public BytesOnlyComparator(Comparator<BytesRef> other) {
    this.other = other;
  }

  @Override
  public int compare(BytesRef left, BytesRef right) {
    return other.compare(trimmedView(leftScratch, left), trimmedView(rightScratch, right));
  }

  /** Points the view at the entry's bytes, leaving out its last 8 bytes. */
  private static BytesRef trimmedView(BytesRef view, BytesRef entry) {
    view.bytes = entry.bytes;
    view.offset = entry.offset;
    view.length = entry.length - 8;
    return view;
  }
}
/**
* Encodes one entry as the term bytes followed by the weight written as a
* long, and hands the encoded bytes to the writer.
* <p>
* NOTE(review): when the buffer is too small it is grown, but the grown array
* is only assigned to the local parameter. The caller's array reference is
* not updated, so a caller that keeps passing its original array presumably
* triggers a new allocation on every such call - consider returning the
* buffer.
* </p>
* @param writer the writer that receives the encoded entry
* @param output a reusable output that is reset onto the buffer
* @param buffer the scratch buffer to encode into
* @param spare the term bytes
* @param weight the weight of the term
* @throws IOException if writing fails
*/
protected void encode(ByteSequencesWriter writer, ByteArrayDataOutput output, byte[] buffer, BytesRef spare, long weight) throws IOException {
// 8 = number of bytes needed for the weight
if (spare.length + 8 >= buffer.length) {
buffer = ArrayUtil.grow(buffer, spare.length + 8);
}
output.reset(buffer);
output.writeBytes(spare.bytes, spare.offset, spare.length);
output.writeLong(weight);
writer.write(buffer, 0, output.getPosition());
}
/**
 * Decodes an entry consisting of the term bytes followed by the weight as a
 * long. The weight is read from the last 8 bytes of the entry and the given
 * scratch is shortened so that it only covers the term bytes afterwards.
 * @param scratch the encoded entry; its length is reduced by 8
 * @param tmpInput a reusable input used to read the weight
 * @return the decoded weight
 */
protected long decode(BytesRef scratch, ByteArrayDataInput tmpInput) {
  // start reading at scratch.offset so that entries which do not begin at
  // index 0 of the backing array are decoded correctly as well
  tmpInput.reset(scratch.bytes, scratch.offset, scratch.length);
  tmpInput.skipBytes(scratch.length - 8); // skip the term bytes
  scratch.length -= 8; // drop the trailing weight (one long)
  return tmpInput.readLong();
}
}

View File

@ -26,6 +26,7 @@ import org.apache.lucene.util.BytesRef;
/**
* This wrapper buffers the incoming elements and makes sure they are in
* random order.
* @lucene.experimental
*/
public class UnsortedTermFreqIteratorWrapper extends BufferingTermFreqIteratorWrapper {
// TODO keep this for now

View File

@ -18,13 +18,16 @@ package org.apache.lucene.search.suggest.fst;
*/
import java.io.IOException;
import java.util.Iterator;
import java.util.Comparator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
/**
* Collects {@link BytesRef} and then allows one to iterate over their sorted order. Implementations
* of this interface will be called in a single-threaded scenario.
* of this interface will be called in a single-threaded scenario.
* @lucene.experimental
* @lucene.internal
*/
public interface BytesRefSorter {
/**
@ -42,5 +45,7 @@ public interface BytesRefSorter {
*
* @throws IOException If an I/O exception occurs.
*/
Iterator<BytesRef> iterator() throws IOException;
BytesRefIterator iterator() throws IOException;
Comparator<BytesRef> getComparator();
}

View File

@ -18,59 +18,63 @@ package org.apache.lucene.search.suggest.fst;
*/
import java.io.*;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Comparator;
import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesReader;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.IOUtils;
/**
* Builds and iterates over sequences stored on disk.
* @lucene.experimental
* @lucene.internal
*/
public class ExternalRefSorter implements BytesRefSorter, Closeable {
private final Sort sort;
private Sort.ByteSequencesWriter writer;
private File input;
private File sorted;
private File sorted;
/**
* Will buffer all sequences to a temporary file and then sort (all on-disk).
*/
public ExternalRefSorter(Sort sort) throws IOException {
this.sort = sort;
this.input = File.createTempFile("RefSorter-", ".raw", Sort.defaultTempDir());
this.input = File.createTempFile("RefSorter-", ".raw",
Sort.defaultTempDir());
this.writer = new Sort.ByteSequencesWriter(input);
}
@Override
public void add(BytesRef utf8) throws IOException {
if (writer == null)
throw new IllegalStateException();
if (writer == null) throw new IllegalStateException();
writer.write(utf8);
}
@Override
public Iterator<BytesRef> iterator() throws IOException {
public BytesRefIterator iterator() throws IOException {
if (sorted == null) {
closeWriter();
sorted = File.createTempFile("RefSorter-", ".sorted", Sort.defaultTempDir());
sorted = File.createTempFile("RefSorter-", ".sorted",
Sort.defaultTempDir());
sort.sort(input, sorted);
input.delete();
input = null;
}
return new ByteSequenceIterator(new Sort.ByteSequencesReader(sorted));
return new ByteSequenceIterator(new Sort.ByteSequencesReader(sorted),
sort.getComparator());
}
private void closeWriter() throws IOException {
if (writer != null) {
writer.close();
writer = null;
}
}
/**
* Removes any written temporary files.
*/
@ -83,40 +87,54 @@ public class ExternalRefSorter implements BytesRefSorter, Closeable {
if (sorted != null) sorted.delete();
}
}
/**
* Iterate over byte refs in a file.
*/
class ByteSequenceIterator implements Iterator<BytesRef> {
private ByteSequencesReader reader;
private byte[] next;
public ByteSequenceIterator(ByteSequencesReader reader) throws IOException {
class ByteSequenceIterator implements BytesRefIterator {
private final ByteSequencesReader reader;
private BytesRef scratch = new BytesRef();
private final Comparator<BytesRef> comparator;
public ByteSequenceIterator(ByteSequencesReader reader,
Comparator<BytesRef> comparator) {
this.reader = reader;
this.next = reader.read();
}
@Override
public boolean hasNext() {
return next != null;
this.comparator = comparator;
}
@Override
public BytesRef next() {
if (next == null) throw new NoSuchElementException();
BytesRef r = new BytesRef(next);
try {
next = reader.read();
if (next == null) {
reader.close();
}
} catch (IOException e) {
throw new RuntimeException(e);
public BytesRef next() throws IOException {
if (scratch == null) {
return null;
}
boolean success = false;
try {
byte[] next = reader.read();
if (next != null) {
scratch.bytes = next;
scratch.length = next.length;
scratch.offset = 0;
} else {
IOUtils.close(reader);
scratch = null;
}
success = true;
return scratch;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(reader);
}
}
return r;
}
@Override
public void remove() { throw new UnsupportedOperationException(); }
public Comparator<BytesRef> getComparator() {
return comparator;
}
}
@Override
public Comparator<BytesRef> getComparator() {
return sort.getComparator();
}
}

View File

@ -28,6 +28,7 @@ import org.apache.lucene.util.fst.FST.Arc;
* Finite state automata based implementation of "autocomplete" functionality.
*
* @see FSTCompletionBuilder
* @lucene.experimental
*/
// TODO: we could store exact weights as outputs from the FST (int4 encoded
@ -159,10 +160,10 @@ public class FSTCompletion {
* @param utf8
* The sequence of utf8 bytes to follow.
*
* @return Returns the bucket number of the match or <code>null</code> if no
* @return Returns the bucket number of the match or <code>-1</code> if no
* match was found.
*/
private Integer getExactMatchStartingFromRootArc(
private int getExactMatchStartingFromRootArc(
int rootArcIndex, BytesRef utf8) {
// Get the UTF-8 bytes representation of the input key.
try {
@ -186,7 +187,7 @@ public class FSTCompletion {
}
// No match.
return null;
return -1;
}
/**
@ -273,8 +274,8 @@ public class FSTCompletion {
// exact match, if requested.
if (exactFirst) {
if (!checkExistingAndReorder(res, key)) {
Integer exactMatchBucket = getExactMatchStartingFromRootArc(i, key);
if (exactMatchBucket != null) {
int exactMatchBucket = getExactMatchStartingFromRootArc(i, key);
if (exactMatchBucket != -1) {
// Insert as the first result and truncate at num.
while (res.size() >= num) {
res.remove(res.size() - 1);
@ -385,10 +386,10 @@ public class FSTCompletion {
}
/**
* Returns the bucket assigned to a given key (if found) or <code>null</code> if
* Returns the bucket assigned to a given key (if found) or <code>-1</code> if
* no exact match exists.
*/
public Integer getBucket(CharSequence key) {
public int getBucket(CharSequence key) {
return getExactMatchStartingFromRootArc(0, new BytesRef(key));
}

View File

@ -19,9 +19,9 @@ package org.apache.lucene.search.suggest.fst;
import java.io.Closeable;
import java.io.IOException;
import java.util.Iterator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.*;
@ -98,6 +98,7 @@ import org.apache.lucene.util.fst.*;
* change, requiring you to rebuild the FST suggest index.
*
* @see FSTCompletion
* @lucene.experimental
*/
public class FSTCompletionBuilder {
/**
@ -143,10 +144,11 @@ public class FSTCompletionBuilder {
/**
* Creates an {@link FSTCompletion} with default options: 10 buckets, exact match
* promoted to first position and {@link InMemorySorter}.
* promoted to first position and {@link InMemorySorter} with a comparator obtained from
* {@link BytesRef#getUTF8SortedAsUnicodeComparator()}.
*/
public FSTCompletionBuilder() {
this(DEFAULT_BUCKETS, new InMemorySorter(), Integer.MAX_VALUE);
this(DEFAULT_BUCKETS, new InMemorySorter(BytesRef.getUTF8SortedAsUnicodeComparator()), Integer.MAX_VALUE);
}
/**
@ -237,10 +239,12 @@ public class FSTCompletionBuilder {
shareMaxTailLength, outputs, null, false);
BytesRef scratch = new BytesRef();
BytesRef entry;
final IntsRef scratchIntsRef = new IntsRef();
int count = 0;
for (Iterator<BytesRef> i = sorter.iterator(); i.hasNext(); count++) {
BytesRef entry = i.next();
BytesRefIterator iter = sorter.iterator();
while((entry = iter.next()) != null) {
count++;
if (scratch.compareTo(entry) != 0) {
builder.add(Util.toIntsRef(entry, scratchIntsRef), empty);
scratch.copyBytes(entry);

View File

@ -59,6 +59,7 @@ import org.apache.lucene.util.fst.NoOutputs;
* use {@link FSTCompletion} directly or {@link TSTLookup}, for example.
*
* @see FSTCompletion
* @lucene.experimental
*/
public class FSTCompletionLookup extends Lookup {
/**
@ -171,7 +172,7 @@ public class FSTCompletionLookup extends Lookup {
}
output.reset(buffer);
output.writeInt(FloatMagic.toSortable(tfit.weight()));
output.writeInt(encodeWeight(tfit.weight()));
output.writeBytes(spare.bytes, spare.offset, spare.length);
writer.write(buffer, 0, output.getPosition());
}
@ -188,13 +189,13 @@ public class FSTCompletionLookup extends Lookup {
reader = new Sort.ByteSequencesReader(tempSorted);
long line = 0;
int previousBucket = 0;
float previousScore = 0;
int previousScore = 0;
ByteArrayDataInput input = new ByteArrayDataInput();
BytesRef tmp1 = new BytesRef();
BytesRef tmp2 = new BytesRef();
while (reader.read(tmp1)) {
input.reset(tmp1.bytes);
float currentScore = FloatMagic.fromSortable(input.readInt());
int currentScore = input.readInt();
int bucket;
if (line > 0 && currentScore == previousScore) {
@ -230,6 +231,14 @@ public class FSTCompletionLookup extends Lookup {
tempSorted.delete();
}
}
/**
 * weight -> cost: narrows the given weight to an int.
 * @throws UnsupportedOperationException if the weight does not fit into an int
 */
private static int encodeWeight(long value) {
  final int narrowed = (int) value;
  // the narrowing cast only round-trips for values inside the int range
  if (narrowed != value) {
    throw new UnsupportedOperationException("cannot encode value: " + value);
  }
  return narrowed;
}
@Override
public List<LookupResult> lookup(CharSequence key, boolean higherWeightsFirst, int num) {
@ -250,19 +259,9 @@ public class FSTCompletionLookup extends Lookup {
return results;
}
@Override
public boolean add(CharSequence key, Object value) {
// Not supported.
return false;
}
@Override
public Object get(CharSequence key) {
Integer bucket = normalCompletion.getBucket(key);
if (bucket == null)
return null;
else
return (float) normalCompletion.getBucket(key) / normalCompletion.getBucketCount();
final int bucket = normalCompletion.getBucket(key);
return bucket == -1 ? null : Long.valueOf(bucket);
}
/**

View File

@ -1,75 +0,0 @@
package org.apache.lucene.search.suggest.fst;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.NumericUtils;
/**
 * Maps float bit patterns ({@link Float#floatToIntBits(float)}) to integers
 * whose unsigned order (or the order after promoting to a long with the
 * higher 32 bits zeroed) equals the order of the floats, and back.
 */
class FloatMagic {
  /**
   * Converts a float to a directly sortable unsigned integer. For sortable
   * signed integers, see {@link NumericUtils#floatToSortableInt(float)}.
   */
  public static int toSortable(float f) {
    final int rawBits = Float.floatToRawIntBits(f);
    return floatBitsToUnsignedOrdered(rawBits);
  }

  /**
   * Inverse of {@link #toSortable(float)}.
   */
  public static float fromSortable(int v) {
    final int floatBits = unsignedOrderedToFloatBits(v);
    return Float.intBitsToFloat(floatBits);
  }

  /**
   * Converts float bits to directly sortable bits. All NaN bit patterns are
   * mapped to the value of one canonical quiet NaN.
   */
  static int floatBitsToUnsignedOrdered(int v) {
    // anything above the infinity pattern (sign bit ignored) is a NaN
    final boolean nan = (v & 0x7fffffff) > 0x7f800000;
    if (nan) {
      return 0x7fc00000 ^ 0x80000000;
    }
    // negative values: invert all bits, which reverses their order and moves
    // them below the positive values; positive values: set the top bit only
    return v < 0 ? ~v : (v ^ 0x80000000);
  }

  /**
   * Inverse of {@link #floatBitsToUnsignedOrdered(int)}.
   */
  static int unsignedOrderedToFloatBits(int v) {
    // a set top bit marks a value that was non-negative (or NaN) before
    return v < 0 ? (v & ~0x80000000) : ~v;
  }
}

View File

@ -17,29 +17,40 @@ package org.apache.lucene.search.suggest.fst;
* limitations under the License.
*/
import java.util.*;
import java.util.Comparator;
import org.apache.lucene.search.suggest.BytesRefList;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
/**
* An {@link BytesRefSorter} that keeps all the entries in memory.
* @lucene.experimental
* @lucene.internal
*/
public final class InMemorySorter implements BytesRefSorter {
// TODO: use a single byte[] to back up all entries?
private final ArrayList<BytesRef> refs = new ArrayList<BytesRef>();
private final BytesRefList buffer = new BytesRefList();
private boolean closed = false;
private final Comparator<BytesRef> comparator;
public InMemorySorter(Comparator<BytesRef> comparator) {
this.comparator = comparator;
}
@Override
public void add(BytesRef utf8) {
if (closed) throw new IllegalStateException();
refs.add(BytesRef.deepCopyOf(utf8));
buffer.append(utf8);
}
@Override
public Iterator<BytesRef> iterator() {
public BytesRefIterator iterator() {
closed = true;
Collections.sort(refs, BytesRef.getUTF8SortedAsUnicodeComparator());
return Collections.unmodifiableCollection(refs).iterator();
return buffer.iterator(comparator);
}
@Override
public Comparator<BytesRef> getComparator() {
return comparator;
}
}

View File

@ -20,15 +20,10 @@ package org.apache.lucene.search.suggest.fst;
import java.io.*;
import java.util.*;
import org.apache.lucene.search.suggest.BytesRefList;
import org.apache.lucene.util.*;
import org.apache.lucene.util.PriorityQueue;
// TODO: the buffer is currently byte[][] which with very small arrays will terribly overallocate
// memory (alignments) and make GC very happy.
//
// We could move it to a single byte[] + and use custom sorting, but we'd need to check if this
// yields any improvement first.
/**
* On-disk sorting of byte arrays. Each byte array (entry) is a composed of the following
* fields:
@ -38,6 +33,8 @@ import org.apache.lucene.util.PriorityQueue;
* </ul>
*
* @see #sort(File, File)
* @lucene.experimental
* @lucene.internal
*/
public final class Sort {
public final static int MB = 1024 * 1024;
@ -59,11 +56,6 @@ public final class Sort {
*/
public final static int MAX_TEMPFILES = 128;
/**
* Minimum slot buffer expansion.
*/
private final static int MIN_EXPECTED_GROWTH = 1000;
/**
* A bit more descriptive unit for constructors.
*
@ -111,21 +103,6 @@ public final class Sort {
}
}
/**
* byte[] in unsigned byte order.
*/
static final Comparator<byte[]> unsignedByteOrderComparator = new Comparator<byte[]>() {
public int compare(byte[] left, byte[] right) {
final int max = Math.min(left.length, right.length);
for (int i = 0, j = 0; i < max; i++, j++) {
int diff = (left[i] & 0xff) - (right[j] & 0xff);
if (diff != 0)
return diff;
}
return left.length - right.length;
}
};
/**
* Sort info (debugging mostly).
*/
@ -149,14 +126,15 @@ public final class Sort {
}
}
private final static byte [][] EMPTY = new byte [0][];
private final BufferSize ramBufferSize;
private final File tempDirectory;
private byte [][] buffer = new byte [0][];
private final BytesRefList buffer = new BytesRefList();
private SortInfo sortInfo;
private int maxTempFiles;
private final Comparator<BytesRef> comparator;
public static final Comparator<BytesRef> DEFAULT_COMPARATOR = BytesRef.getUTF8SortedAsUnicodeComparator();
/**
* Defaults constructor.
@ -165,13 +143,17 @@ public final class Sort {
* @see BufferSize#automatic()
*/
public Sort() throws IOException {
this(BufferSize.automatic(), defaultTempDir(), MAX_TEMPFILES);
this(DEFAULT_COMPARATOR, BufferSize.automatic(), defaultTempDir(), MAX_TEMPFILES);
}
public Sort(Comparator<BytesRef> comparator) throws IOException {
this(comparator, BufferSize.automatic(), defaultTempDir(), MAX_TEMPFILES);
}
/**
* All-details constructor.
*/
public Sort(BufferSize ramBufferSize, File tempDirectory, int maxTempfiles) {
public Sort(Comparator<BytesRef> comparator, BufferSize ramBufferSize, File tempDirectory, int maxTempfiles) {
if (ramBufferSize.bytes < ABSOLUTE_MIN_SORT_BUFFER_SIZE) {
throw new IllegalArgumentException(MIN_BUFFER_SIZE_MSG + ": " + ramBufferSize.bytes);
}
@ -183,6 +165,7 @@ public final class Sort {
this.ramBufferSize = ramBufferSize;
this.tempDirectory = tempDirectory;
this.maxTempFiles = maxTempfiles;
this.comparator = comparator;
}
/**
@ -283,23 +266,25 @@ public final class Sort {
/** Sort a single partition in-memory. */
protected File sortPartition(int len) throws IOException {
byte [][] data = this.buffer;
BytesRefList data = this.buffer;
File tempFile = File.createTempFile("sort", "partition", tempDirectory);
long start = System.currentTimeMillis();
Arrays.sort(data, 0, len, unsignedByteOrderComparator);
sortInfo.sortTime += (System.currentTimeMillis() - start);
ByteSequencesWriter out = new ByteSequencesWriter(tempFile);
final ByteSequencesWriter out = new ByteSequencesWriter(tempFile);
BytesRef spare;
try {
for (int i = 0; i < len; i++) {
assert data[i].length <= Short.MAX_VALUE;
out.write(data[i]);
BytesRefIterator iter = buffer.iterator(comparator);
while((spare = iter.next()) != null) {
assert spare.length <= Short.MAX_VALUE;
out.write(spare);
}
out.close();
// Clean up the buffer for the next partition.
this.buffer = EMPTY;
data.clear();
return tempFile;
} finally {
IOUtils.close(out);
@ -314,7 +299,7 @@ public final class Sort {
PriorityQueue<FileAndTop> queue = new PriorityQueue<FileAndTop>(merges.size()) {
protected boolean lessThan(FileAndTop a, FileAndTop b) {
return a.current.compareTo(b.current) < 0;
return comparator.compare(a.current, b.current) < 0;
}
};
@ -359,33 +344,18 @@ public final class Sort {
/** Read in a single partition of data */
int readPartition(ByteSequencesReader reader) throws IOException {
long start = System.currentTimeMillis();
// We will be reallocating from scratch.
Arrays.fill(this.buffer, null);
int bytesLimit = this.ramBufferSize.bytes;
byte [][] data = this.buffer;
byte[] line;
int linesRead = 0;
while ((line = reader.read()) != null) {
if (linesRead + 1 >= data.length) {
data = Arrays.copyOf(data,
ArrayUtil.oversize(linesRead + MIN_EXPECTED_GROWTH,
RamUsageEstimator.NUM_BYTES_OBJECT_REF));
}
data[linesRead++] = line;
final BytesRef scratch = new BytesRef();
while ((scratch.bytes = reader.read()) != null) {
scratch.length = scratch.bytes.length;
buffer.append(scratch);
// Account for the created objects.
// (buffer slots do not account to buffer size.)
bytesLimit -= line.length + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER;
if (bytesLimit < 0) {
if (ramBufferSize.bytes < buffer.bytesUsed()) {
break;
}
}
this.buffer = data;
sortInfo.readTime += (System.currentTimeMillis() - start);
return linesRead;
return buffer.size();
}
static class FileAndTop {
@ -515,5 +485,9 @@ public final class Sort {
((Closeable) is).close();
}
}
}
/**
 * Returns the comparator stored in this instance's <code>comparator</code> field.
 */
public Comparator<BytesRef> getComparator() {
  return this.comparator;
}
}

View File

@ -28,6 +28,8 @@ import java.util.List;
import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.SortedTermFreqIteratorWrapper;
import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesWriter;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.InputStreamDataInput;
@ -102,72 +104,27 @@ public class WFSTCompletionLookup extends Lookup {
@Override
public void build(TermFreqIterator iterator) throws IOException {
String prefix = getClass().getSimpleName();
File directory = Sort.defaultTempDir();
File tempInput = File.createTempFile(prefix, ".input", directory);
File tempSorted = File.createTempFile(prefix, ".sorted", directory);
Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput);
Sort.ByteSequencesReader reader = null;
BytesRef scratch = new BytesRef();
boolean success = false;
try {
byte [] buffer = new byte [0];
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
BytesRef spare;
while ((spare = iterator.next()) != null) {
if (spare.length + 5 >= buffer.length) {
buffer = ArrayUtil.grow(buffer, spare.length + 5);
}
output.reset(buffer);
output.writeBytes(spare.bytes, spare.offset, spare.length);
output.writeByte((byte)0); // separator: not used, just for sort order
output.writeInt((int)encodeWeight(iterator.weight()));
writer.write(buffer, 0, output.getPosition());
}
writer.close();
new Sort().sort(tempInput, tempSorted);
reader = new Sort.ByteSequencesReader(tempSorted);
TermFreqIterator iter = new WFSTTermFreqIteratorWrapper(iterator,
BytesRef.getUTF8SortedAsUnicodeComparator());
IntsRef scratchInts = new IntsRef();
BytesRef previous = null;
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
while ((scratch = iter.next()) != null) {
long cost = iter.weight();
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
BytesRef previous = null;
BytesRef suggestion = new BytesRef();
IntsRef scratchInts = new IntsRef();
ByteArrayDataInput input = new ByteArrayDataInput();
while (reader.read(scratch)) {
suggestion.bytes = scratch.bytes;
suggestion.offset = scratch.offset;
suggestion.length = scratch.length - 5; // int + separator
input.reset(scratch.bytes);
input.skipBytes(suggestion.length + 1); // suggestion + separator
long cost = input.readInt();
if (previous == null) {
previous = new BytesRef();
} else if (suggestion.equals(previous)) {
continue; // for duplicate suggestions, the best weight is actually added
}
Util.toIntsRef(suggestion, scratchInts);
builder.add(scratchInts, cost);
previous.copyBytes(suggestion);
if (previous == null) {
previous = new BytesRef();
} else if (scratch.equals(previous)) {
continue; // for duplicate suggestions, the best weight is actually
// added
}
fst = builder.finish();
success = true;
} finally {
if (success) {
IOUtils.close(reader, writer);
} else {
IOUtils.closeWhileHandlingException(reader, writer);
}
tempInput.delete();
tempSorted.delete();
Util.toIntsRef(scratch, scratchInts);
builder.add(scratchInts, cost);
previous.copyBytes(scratch);
}
fst = builder.finish();
}
@Override
@ -270,16 +227,10 @@ public class WFSTCompletionLookup extends Lookup {
return output;
}
@Override
public boolean add(CharSequence key, Object value) {
return false; // Not supported.
}
/**
* Returns the weight associated with an input string,
* or null if it does not exist.
*/
@Override
public Object get(CharSequence key) {
Arc<Long> arc = new Arc<Long>();
Long result = null;
@ -289,23 +240,51 @@ public class WFSTCompletionLookup extends Lookup {
if (result == null || !arc.isFinal()) {
return null;
} else {
return decodeWeight(result + arc.nextFinalOutput);
return Integer.valueOf(decodeWeight(result + arc.nextFinalOutput));
}
}
/** cost -> weight */
private static float decodeWeight(long encoded) {
return Integer.MAX_VALUE - encoded;
private static int decodeWeight(long encoded) {
return (int)(Integer.MAX_VALUE - encoded);
}
/** weight -> cost */
private static long encodeWeight(float value) {
if (Float.isNaN(value) || Float.isInfinite(value) || value < 0 || value > Integer.MAX_VALUE) {
private static int encodeWeight(long value) {
if (value < 0 || value > Integer.MAX_VALUE) {
throw new UnsupportedOperationException("cannot encode value: " + value);
}
return Integer.MAX_VALUE - (int)value;
}
/**
 * Sorted term/weight iterator used while building the FST. Each entry is
 * encoded as the suggestion bytes, followed by a single <code>0</code>
 * separator byte, followed by the 4-byte encoded weight (see
 * {@link #encode} and {@link #decode}).
 */
private final class WFSTTermFreqIteratorWrapper extends SortedTermFreqIteratorWrapper {

  /**
   * @param source the unsorted terms and weights
   * @param comparator the order applied to the encoded entries
   */
  WFSTTermFreqIteratorWrapper(TermFreqIterator source,
      Comparator<BytesRef> comparator) throws IOException {
    super(source, comparator, true);
  }

  /**
   * Writes one entry as <code>suggestion + 0 + int(encodeWeight(weight))</code>.
   *
   * @param writer receives the encoded entry
   * @param output scratch output, reset onto <code>buffer</code> for every entry
   * @param buffer scratch bytes; grown locally when the entry does not fit
   * @param spare the suggestion bytes
   * @param weight the suggestion weight, converted with <code>encodeWeight</code>
   */
  @Override
  protected void encode(ByteSequencesWriter writer, ByteArrayDataOutput output, byte[] buffer, BytesRef spare, long weight) throws IOException {
    // suggestion + 1 separator byte + 4 weight bytes
    if (spare.length + 5 >= buffer.length) {
      // Only the local reference is replaced; the caller keeps its own array.
      buffer = ArrayUtil.grow(buffer, spare.length + 5);
    }
    output.reset(buffer);
    output.writeBytes(spare.bytes, spare.offset, spare.length);
    output.writeByte((byte)0); // separator: not used, just for sort order
    output.writeInt(encodeWeight(weight));
    writer.write(buffer, 0, output.getPosition());
  }

  /**
   * Reads the trailing 4-byte value of an encoded entry and shrinks
   * <code>scratch</code> so that it covers only the suggestion bytes.
   *
   * @param scratch the encoded entry; its length is reduced by 5 on return
   * @param tmpInput reusable input, reset onto <code>scratch.bytes</code>
   * @return the int stored after the separator, widened to long
   */
  @Override
  protected long decode(BytesRef scratch, ByteArrayDataInput tmpInput) {
    tmpInput.reset(scratch.bytes);
    // Skip suggestion + separator. The slice may start at a non-zero offset
    // inside the backing array, so the offset has to be part of the skip.
    tmpInput.skipBytes(scratch.offset + scratch.length - 4);
    scratch.length -= 5; // separator + int
    return tmpInput.readInt();
  }
}
static final Comparator<Long> weightComparator = new Comparator<Long> () {
public int compare(Long left, Long right) {
return left.compareTo(right);

View File

@ -55,24 +55,22 @@ public class JaspellLookup extends Lookup {
final CharsRef charsSpare = new CharsRef();
while ((spare = tfit.next()) != null) {
float freq = tfit.weight();
final long weight = tfit.weight();
if (spare.length == 0) {
continue;
}
charsSpare.grow(spare.length);
UnicodeUtil.UTF8toUTF16(spare.bytes, spare.offset, spare.length, charsSpare);
trie.put(charsSpare.toString(), new Float(freq));
trie.put(charsSpare.toString(), Long.valueOf(weight));
}
}
@Override
public boolean add(CharSequence key, Object value) {
trie.put(key, value);
// XXX
return false;
}
@Override
public Object get(CharSequence key) {
return trie.get(key);
}
@ -95,7 +93,7 @@ public class JaspellLookup extends Lookup {
if (onlyMorePopular) {
LookupPriorityQueue queue = new LookupPriorityQueue(num);
for (String s : list) {
float freq = (Float)trie.get(s);
long freq = ((Number)trie.get(s)).longValue();
queue.insertWithOverflow(new LookupResult(new CharsRef(s), freq));
}
for (LookupResult lr : queue.getResults()) {
@ -104,7 +102,7 @@ public class JaspellLookup extends Lookup {
} else {
for (int i = 0; i < maxCnt; i++) {
String s = list.get(i);
float freq = (Float)trie.get(s);
long freq = ((Number)trie.get(s)).longValue();
res.add(new LookupResult(new CharsRef(s), freq));
}
}
@ -131,7 +129,7 @@ public class JaspellLookup extends Lookup {
node.splitchar = in.readChar();
byte mask = in.readByte();
if ((mask & HAS_VALUE) != 0) {
node.data = new Float(in.readFloat());
node.data = Long.valueOf(in.readLong());
}
if ((mask & LO_KID) != 0) {
TSTNode kid = trie.new TSTNode('\0', node);
@ -171,7 +169,7 @@ public class JaspellLookup extends Lookup {
if (node.data != null) mask |= HAS_VALUE;
out.writeByte(mask);
if (node.data != null) {
out.writeFloat((Float)node.data);
out.writeLong(((Number)node.data).longValue());
}
writeRecursively(out, node.relatives[TSTNode.LOKID]);
writeRecursively(out, node.relatives[TSTNode.EQKID]);

View File

@ -50,26 +50,24 @@ public class TSTLookup extends Lookup {
}
ArrayList<String> tokens = new ArrayList<String>();
ArrayList<Float> vals = new ArrayList<Float>();
ArrayList<Number> vals = new ArrayList<Number>();
BytesRef spare;
CharsRef charsSpare = new CharsRef();
while ((spare = tfit.next()) != null) {
charsSpare.grow(spare.length);
UnicodeUtil.UTF8toUTF16(spare.bytes, spare.offset, spare.length, charsSpare);
tokens.add(charsSpare.toString());
vals.add(new Float(tfit.weight()));
vals.add(Long.valueOf(tfit.weight()));
}
autocomplete.balancedTree(tokens.toArray(), vals.toArray(), 0, tokens.size() - 1, root);
}
@Override
public boolean add(CharSequence key, Object value) {
autocomplete.insert(root, key, value, 0);
// XXX we don't know if a new node was created
return true;
}
@Override
public Object get(CharSequence key) {
List<TernaryTreeNode> list = autocomplete.prefixCompletion(root, key, 0);
if (list == null || list.isEmpty()) {
@ -107,7 +105,7 @@ public class TSTLookup extends Lookup {
if (onlyMorePopular) {
LookupPriorityQueue queue = new LookupPriorityQueue(num);
for (TernaryTreeNode ttn : list) {
queue.insertWithOverflow(new LookupResult(ttn.token, (Float)ttn.val));
queue.insertWithOverflow(new LookupResult(ttn.token, ((Number)ttn.val).longValue()));
}
for (LookupResult lr : queue.getResults()) {
res.add(lr);
@ -115,7 +113,7 @@ public class TSTLookup extends Lookup {
} else {
for (int i = 0; i < maxCnt; i++) {
TernaryTreeNode ttn = list.get(i);
res.add(new LookupResult(ttn.token, (Float)ttn.val));
res.add(new LookupResult(ttn.token, ((Number)ttn.val).longValue()));
}
}
return res;
@ -146,7 +144,7 @@ public class TSTLookup extends Lookup {
node.token = in.readUTF();
}
if ((mask & HAS_VALUE) != 0) {
node.val = new Float(in.readFloat());
node.val = Long.valueOf(in.readLong());
}
if ((mask & LO_KID) != 0) {
node.loKid = new TernaryTreeNode();
@ -184,7 +182,7 @@ public class TSTLookup extends Lookup {
if (node.val != null) mask |= HAS_VALUE;
out.writeByte(mask);
if (node.token != null) out.writeUTF(node.token);
if (node.val != null) out.writeFloat((Float)node.val);
if (node.val != null) out.writeLong(((Number)node.val).longValue());
// recurse and write kids
if (node.loKid != null) {
writeRecursively(out, node.loKid);

View File

@ -17,8 +17,10 @@
package org.apache.lucene.search.suggest;
import java.io.File;
import java.util.List;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.search.suggest.fst.FSTCompletionLookup;
import org.apache.lucene.search.suggest.jaspell.JaspellLookup;
import org.apache.lucene.search.suggest.tst.TSTLookup;
@ -74,16 +76,18 @@ public class PersistenceTest extends LuceneTestCase {
lookup.load(storeDir);
// Assert validity.
float previous = Float.NEGATIVE_INFINITY;
long previous = Long.MIN_VALUE;
for (TermFreq k : keys) {
Float val = (Float) lookup.get(_TestUtil.bytesToCharSequence(k.term, random));
assertNotNull(k.term.utf8ToString(), val);
List<LookupResult> list = lookup.lookup(_TestUtil.bytesToCharSequence(k.term, random), false, 1);
assertEquals(1, list.size());
LookupResult lookupResult = list.get(0);
assertNotNull(k.term.utf8ToString(), lookupResult.key);
if (supportsExactWeights) {
assertEquals(k.term.utf8ToString(), Float.valueOf(k.v), val);
assertEquals(k.term.utf8ToString(), k.v, lookupResult.value);
} else {
assertTrue(val + ">=" + previous, val >= previous);
previous = val.floatValue();
assertTrue(lookupResult.value + ">=" + previous, lookupResult.value >= previous);
previous = lookupResult.value;
}
}
}

View File

@ -29,59 +29,79 @@ import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
public class TestBytesRefList extends LuceneTestCase {
public void testAppend() throws IOException {
BytesRefList list = new BytesRefList();
List<String> stringList = new ArrayList<String>();
int entries = atLeast(500);
BytesRef spare = new BytesRef();
for (int i = 0; i < entries; i++) {
String randomRealisticUnicodeString = _TestUtil
.randomRealisticUnicodeString(random);
spare.copyChars(randomRealisticUnicodeString);
list.append(spare);
stringList.add(randomRealisticUnicodeString);
}
for (int i = 0; i < entries; i++) {
assertNotNull(list.get(spare, i));
assertEquals("entry " + i + " doesn't match", stringList.get(i),
spare.utf8ToString());
}
// check random
for (int i = 0; i < entries; i++) {
int e = random.nextInt(entries);
assertNotNull(list.get(spare, e));
assertEquals("entry " + i + " doesn't match", stringList.get(e),
spare.utf8ToString());
}
for (int i = 0; i < 2; i++) {
BytesRefIterator iterator = list.iterator();
for (String string : stringList) {
assertEquals(string, iterator.next().utf8ToString());
for (int j = 0; j < 2; j++) {
if (j > 0 && random.nextBoolean()) {
list.clear();
stringList.clear();
}
int entries = atLeast(500);
BytesRef spare = new BytesRef();
for (int i = 0; i < entries; i++) {
String randomRealisticUnicodeString = _TestUtil
.randomRealisticUnicodeString(random);
spare.copyChars(randomRealisticUnicodeString);
list.append(spare);
stringList.add(randomRealisticUnicodeString);
}
for (int i = 0; i < entries; i++) {
assertNotNull(list.get(spare, i));
assertEquals("entry " + i + " doesn't match", stringList.get(i),
spare.utf8ToString());
}
// check random
for (int i = 0; i < entries; i++) {
int e = random.nextInt(entries);
assertNotNull(list.get(spare, e));
assertEquals("entry " + i + " doesn't match", stringList.get(e),
spare.utf8ToString());
}
for (int i = 0; i < 2; i++) {
BytesRefIterator iterator = list.iterator();
for (String string : stringList) {
assertEquals(string, iterator.next().utf8ToString());
}
}
}
}
public void testSort() {
public void testSort() throws IOException {
BytesRefList list = new BytesRefList();
List<String> stringList = new ArrayList<String>();
int entries = atLeast(500);
BytesRef spare = new BytesRef();
for (int i = 0; i < entries; i++) {
String randomRealisticUnicodeString = _TestUtil.randomRealisticUnicodeString(random);
spare.copyChars(randomRealisticUnicodeString);
list.append(spare);
stringList.add(randomRealisticUnicodeString);
}
Collections.sort(stringList);
int[] sortedOrds = list.sort(BytesRef.getUTF8SortedAsUTF16Comparator());
for (int i = 0; i < entries; i++) {
assertNotNull(list.get(spare, sortedOrds[i]));
assertEquals("entry " + i + " doesn't match", stringList.get(i),
spare.utf8ToString());
for (int j = 0; j < 2; j++) {
if (j > 0 && random.nextBoolean()) {
list.clear();
stringList.clear();
}
int entries = atLeast(500);
BytesRef spare = new BytesRef();
for (int i = 0; i < entries; i++) {
String randomRealisticUnicodeString = _TestUtil
.randomRealisticUnicodeString(random);
spare.copyChars(randomRealisticUnicodeString);
list.append(spare);
stringList.add(randomRealisticUnicodeString);
}
Collections.sort(stringList);
BytesRefIterator iter = list.iterator(BytesRef
.getUTF8SortedAsUTF16Comparator());
int i = 0;
while ((spare = iter.next()) != null) {
assertEquals("entry " + i + " doesn't match", stringList.get(i),
spare.utf8ToString());
i++;
}
assertNull(iter.next());
assertEquals(i, stringList.size());
}
}
}

View File

@ -17,12 +17,16 @@ package org.apache.lucene.search.suggest;
* the License.
*/
import java.util.Comparator;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
@ -38,7 +42,8 @@ public class TestTermFreqIterator extends LuceneTestCase {
public void testTerms() throws Exception {
int num = atLeast(10000);
TreeMap<BytesRef,Long> sorted = new TreeMap<BytesRef,Long>();
Comparator<BytesRef> comparator = random.nextBoolean() ? BytesRef.getUTF8SortedAsUnicodeComparator() : BytesRef.getUTF8SortedAsUTF16Comparator();
TreeMap<BytesRef,Long> sorted = new TreeMap<BytesRef,Long>(comparator);
TermFreq[] unsorted = new TermFreq[num];
for (int i = 0; i < num; i++) {
@ -52,13 +57,13 @@ public class TestTermFreqIterator extends LuceneTestCase {
}
// test the sorted iterator wrapper
TermFreqIterator wrapper = new SortedTermFreqIteratorWrapper(new TermFreqArrayIterator(unsorted), BytesRef.getUTF8SortedAsUnicodeComparator());
TermFreqIterator wrapper = new SortedTermFreqIteratorWrapper(new TermFreqArrayIterator(unsorted), comparator);
Iterator<Map.Entry<BytesRef,Long>> expected = sorted.entrySet().iterator();
while (expected.hasNext()) {
Map.Entry<BytesRef,Long> entry = expected.next();
assertEquals(entry.getKey(), wrapper.next());
assertEquals(entry.getValue().longValue(), wrapper.weight(), 0F);
assertEquals(entry.getValue().longValue(), wrapper.weight());
}
assertNull(wrapper.next());
@ -72,4 +77,57 @@ public class TestTermFreqIterator extends LuceneTestCase {
}
assertEquals(sorted, actual);
}
/**
 * Checks {@link SortedTermFreqIteratorWrapper} built with its three-argument
 * constructor (last argument <code>true</code>). The expected order comes from
 * sorting, with the same comparator, byte sequences made of the term bytes
 * immediately followed by the weight written via <code>writeLong</code>.
 */
public void testRaw() throws Exception {
int num = atLeast(10000);
Comparator<BytesRef> comparator = BytesRef.getUTF8SortedAsUnicodeComparator();
// Holds the reference entries (term bytes + 8 weight bytes).
BytesRefHash sorted = new BytesRefHash();
TermFreq[] unsorted = new TermFreq[num];
byte[] buffer = new byte[0];
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
for (int i = 0; i < num; i++) {
BytesRef spare;
long weight;
// Draw a new random term/weight pair while sorted.add(...) returns a
// negative value.
// NOTE(review): presumably a negative result means the entry was already
// present - confirm against BytesRefHash.add.
do {
spare = new BytesRef(_TestUtil.randomUnicodeString(random));
// Make room for the term plus the 8 bytes of the long weight.
if (spare.length + 8 >= buffer.length) {
buffer = ArrayUtil.grow(buffer, spare.length + 8);
}
output.reset(buffer);
output.writeBytes(spare.bytes, spare.offset, spare.length);
weight = random.nextLong();
output.writeLong(weight);
} while (sorted.add(new BytesRef(buffer, 0, output.getPosition())) < 0);
unsorted[i] = new TermFreq(spare, weight);
}
// test the sorted iterator wrapper
TermFreqIterator wrapper = new SortedTermFreqIteratorWrapper(new TermFreqArrayIterator(unsorted), comparator, true);
int[] sort = sorted.sort(comparator);
int size = sorted.size();
BytesRef spare = new BytesRef();
for (int i = 0; i < size; i++) {
sorted.get(sort[i], spare);
spare.length -= 8; // drop the trailing 8 weight bytes, leaving only the term
assertEquals(spare, wrapper.next());
// Re-point spare at the 8 bytes that follow the term and compare them,
// read as a long, with the weight reported by the wrapper.
spare.offset = spare.offset + spare.length;
spare.length = 8;
assertEquals(asLong(spare), wrapper.weight());
}
// The wrapper must be exhausted once every reference entry was seen.
assertNull(wrapper.next());
}
/**
 * Reads the eight bytes starting at <code>b.offset</code> as a long, most
 * significant byte first.
 */
public static long asLong(BytesRef b) {
  final long high = asIntInternal(b, b.offset);
  // Mask so the sign extension of the low int cannot leak into the high half.
  final long low = asIntInternal(b, b.offset + 4) & 0xFFFFFFFFL;
  return (high << 32) | low;
}
/**
 * Assembles the four bytes at <code>pos .. pos + 3</code> of
 * <code>b.bytes</code> into an int, most significant byte first.
 */
private static int asIntInternal(BytesRef b, int pos) {
  final byte[] bytes = b.bytes;
  int value = 0;
  for (int i = 0; i < 4; i++) {
    value = (value << 8) | (bytes[pos + i] & 0xFF);
  }
  return value;
}
}

View File

@ -17,9 +17,8 @@ package org.apache.lucene.search.suggest.fst;
* limitations under the License.
*/
import java.util.Iterator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;
@ -31,7 +30,7 @@ public class BytesRefSortersTest extends LuceneTestCase {
@Test
public void testInMemorySorter() throws Exception {
check(new InMemorySorter());
check(new InMemorySorter(BytesRef.getUTF8SortedAsUnicodeComparator()));
}
private void check(BytesRefSorter sorter) throws Exception {
@ -42,8 +41,8 @@ public class BytesRefSortersTest extends LuceneTestCase {
}
// Create two iterators and check that they're aligned with each other.
Iterator<BytesRef> i1 = sorter.iterator();
Iterator<BytesRef> i2 = sorter.iterator();
BytesRefIterator i1 = sorter.iterator();
BytesRefIterator i2 = sorter.iterator();
// Verify sorter contract.
try {
@ -52,10 +51,12 @@ public class BytesRefSortersTest extends LuceneTestCase {
} catch (IllegalStateException e) {
// Expected.
}
while (i1.hasNext() && i2.hasNext()) {
assertEquals(i1.next(), i2.next());
BytesRef spare1;
BytesRef spare2;
while ((spare1 = i1.next()) != null && (spare2 = i2.next()) != null) {
assertEquals(spare1, spare2);
}
assertEquals(i1.hasNext(), i2.hasNext());
assertNull(i1.next());
assertNull(i2.next());
}
}

View File

@ -165,9 +165,9 @@ public class FSTCompletionTest extends LuceneTestCase {
// All the weights were constant, so all returned buckets must be constant, whatever they
// are.
Float previous = null;
Long previous = null;
for (TermFreq tf : keys) {
Float current = (Float)lookup.get(_TestUtil.bytesToCharSequence(tf.term, random));
Long current = ((Number)lookup.get(_TestUtil.bytesToCharSequence(tf.term, random))).longValue();
if (previous != null) {
assertEquals(previous, current);
}
@ -181,7 +181,7 @@ public class FSTCompletionTest extends LuceneTestCase {
FSTCompletionLookup lookup = new FSTCompletionLookup();
lookup.build(new TermFreqArrayIterator(input));
for (TermFreq tf : input) {
assertTrue("Not found: " + tf.term.toString(), lookup.get(_TestUtil.bytesToCharSequence(tf.term, random)) != null);
assertNotNull("Not found: " + tf.term.toString(), lookup.get(_TestUtil.bytesToCharSequence(tf.term, random)));
assertEquals(tf.term.utf8ToString(), lookup.lookup(_TestUtil.bytesToCharSequence(tf.term, random), true, 1).get(0).key.toString());
}

View File

@ -1,140 +0,0 @@
package org.apache.lucene.search.suggest.fst;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.*;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.NumericUtils;
import org.junit.Ignore;
import org.junit.Test;
/**
 * Tests for <code>FloatMagic</code>: floats are mapped with
 * <code>toSortable</code> to ints that are compared as unsigned values here,
 * and mapped back with <code>fromSortable</code>.
 */
public class FloatMagicTest extends LuceneTestCase {
/**
 * Sorts a list of special float values with {@link Collections#sort}, sorts
 * their unsigned sortable forms independently, converts those back with
 * <code>fromSortable</code> and expects the same sequence.
 */
public void testFloatMagic() {
ArrayList<Float> floats = new ArrayList<Float>(Arrays.asList(
Float.intBitsToFloat(0x7f800001), // NaN (invalid combination).
Float.intBitsToFloat(0x7fffffff), // NaN (invalid combination).
Float.intBitsToFloat(0xff800001), // NaN (invalid combination).
Float.intBitsToFloat(0xffffffff), // NaN (invalid combination).
Float.POSITIVE_INFINITY,
Float.MAX_VALUE,
100f,
0f,
0.1f,
Float.MIN_VALUE,
Float.NaN,
-0.0f,
-Float.MIN_VALUE,
-0.1f,
-1f,
-10f,
Float.NEGATIVE_INFINITY));
// Sort them using juc.
Collections.sort(floats);
// Convert to sortable int4 representation (as long to have an unsigned sort).
long [] int4 = new long [floats.size()];
for (int i = 0; i < floats.size(); i++) {
int4[i] = FloatMagic.toSortable(floats.get(i)) & 0xffffffffL;
/*
System.out.println(
String.format("raw %8s sortable %8s %8s numutils %8s %s",
Integer.toHexString(Float.floatToRawIntBits(floats.get(i))),
Integer.toHexString(FloatMagic.toSortable(floats.get(i))),
Integer.toHexString(FloatMagic.unsignedOrderedToFloatBits(FloatMagic.toSortable(floats.get(i)))),
Integer.toHexString(NumericUtils.floatToSortableInt(floats.get(i))),
floats.get(i)));
*/
}
// Sort and compare. Should be identical order.
Arrays.sort(int4);
ArrayList<Float> backFromFixed = new ArrayList<Float>();
for (int i = 0; i < int4.length; i++) {
backFromFixed.add(FloatMagic.fromSortable((int) int4[i]));
}
/*
for (int i = 0; i < int4.length; i++) {
System.out.println(
floats.get(i) + " " + FloatMagic.fromSortable((int) int4[i]));
}
*/
assertEquals(floats, backFromFixed);
}
/**
 * Walks every 32-bit pattern (the do/while stops when <code>i</code> wraps
 * around to 0) and checks that <code>fromSortable(toSortable(f))</code> gives
 * back <code>f</code>; two NaNs count as equal. Prints the current pattern
 * whenever its low 24 bits are zero.
 */
@Ignore("Once checked, valid forever?") @Test
public void testRoundTripFullRange() {
int i = 0;
do {
float f = Float.intBitsToFloat(i);
float f2 = FloatMagic.fromSortable(FloatMagic.toSortable(f));
if (!((Float.isNaN(f) && Float.isNaN(f2)) || f == f2)) {
throw new RuntimeException("! " + Integer.toHexString(i) + "> " + f + " " + f2);
}
if ((i & 0xffffff) == 0) {
System.out.println(Integer.toHexString(i));
}
i++;
} while (i != 0);
}
/**
 * Checks neighbouring bit patterns pairwise with {@link #checkSmaller} over
 * the ranges named in the inline comments, then checks that NaN patterns map
 * above positive infinity.
 * NOTE(review): the NaN loops use exclusive upper bounds, so the patterns
 * 0x7fffffff and 0xffffffff themselves are not checked here.
 */
@Ignore("Once checked, valid forever?") @Test
public void testIncreasingFullRange() {
// -infinity ... -0.0
for (int i = 0xff800000; i != 0x80000000; i--) {
checkSmaller(i, i - 1);
}
// -0.0 +0.0
checkSmaller(0x80000000, 0);
// +0.0 ... +infinity
for (int i = 0; i != 0x7f800000; i++) {
checkSmaller(i, i + 1);
}
// All other are NaNs and should be after positive infinity.
final long infinity = toSortableL(Float.POSITIVE_INFINITY);
for (int i = 0x7f800001; i != 0x7fffffff; i++) {
assertTrue(infinity < toSortableL(Float.intBitsToFloat(i)));
}
for (int i = 0xff800001; i != 0xffffffff; i++) {
assertTrue(infinity < toSortableL(Float.intBitsToFloat(i)));
}
}
/**
 * Returns <code>FloatMagic.toSortable(f)</code> widened to long with the upper
 * 32 bits cleared, so that plain long comparison orders it as unsigned.
 */
private long toSortableL(float f) {
return FloatMagic.toSortable(f) & 0xffffffffL;
}
/**
 * Interprets both ints as float bit patterns, fails if the first float is
 * greater than the second, and asserts that the first sortable form is
 * strictly smaller than the second.
 */
private void checkSmaller(int i1, int i2) {
float f1 = Float.intBitsToFloat(i1);
float f2 = Float.intBitsToFloat(i2);
if (f1 > f2) {
throw new AssertionError(f1 + " " + f2 + " " + i1 + " " + i2);
}
assertTrue(toSortableL(f1) < toSortableL(f2));
}
}

View File

@ -20,6 +20,7 @@ package org.apache.lucene.search.suggest.fst;
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import org.apache.lucene.search.suggest.fst.Sort.BufferSize;
import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesWriter;
@ -61,7 +62,7 @@ public class TestSort extends LuceneTestCase {
@Test
public void testIntermediateMerges() throws Exception {
// Sort 20 mb worth of data with 1mb buffer, binary merging.
SortInfo info = checkSort(new Sort(BufferSize.megabytes(1), Sort.defaultTempDir(), 2),
SortInfo info = checkSort(new Sort(Sort.DEFAULT_COMPARATOR, BufferSize.megabytes(1), Sort.defaultTempDir(), 2),
generateRandom(Sort.MB * 20));
assertTrue(info.mergeRounds > 10);
}
@ -69,7 +70,7 @@ public class TestSort extends LuceneTestCase {
@Test
public void testSmallRandom() throws Exception {
// Sort 20 mb worth of data with 1mb buffer.
SortInfo sortInfo = checkSort(new Sort(BufferSize.megabytes(1), Sort.defaultTempDir(), Sort.MAX_TEMPFILES),
SortInfo sortInfo = checkSort(new Sort(Sort.DEFAULT_COMPARATOR, BufferSize.megabytes(1), Sort.defaultTempDir(), Sort.MAX_TEMPFILES),
generateRandom(Sort.MB * 20));
assertEquals(1, sortInfo.mergeRounds);
}
@ -77,7 +78,7 @@ public class TestSort extends LuceneTestCase {
@Test @Nightly
public void testLargerRandom() throws Exception {
// Sort 100MB worth of data with 16mb buffer.
checkSort(new Sort(BufferSize.megabytes(16), Sort.defaultTempDir(), Sort.MAX_TEMPFILES),
checkSort(new Sort(Sort.DEFAULT_COMPARATOR, BufferSize.megabytes(16), Sort.defaultTempDir(), Sort.MAX_TEMPFILES),
generateRandom(Sort.MB * 100));
}
@ -92,14 +93,25 @@ public class TestSort extends LuceneTestCase {
byte [][] bytes = data.toArray(new byte[data.size()][]);
return bytes;
}
/**
 * Compares <code>byte[]</code> values position by position as unsigned bytes;
 * if all shared positions are equal, the shorter array sorts first.
 */
static final Comparator<byte[]> unsignedByteOrderComparator = new Comparator<byte[]>() {
  public int compare(byte[] left, byte[] right) {
    final int shared = Math.min(left.length, right.length);
    for (int idx = 0; idx < shared; idx++) {
      final int delta = (left[idx] & 0xff) - (right[idx] & 0xff);
      if (delta == 0) {
        continue;
      }
      return delta;
    }
    return left.length - right.length;
  }
};
/**
* Check sorting data on an instance of {@link Sort}.
*/
private SortInfo checkSort(Sort sort, byte[][] data) throws IOException {
File unsorted = writeAll("unsorted", data);
Arrays.sort(data, Sort.unsignedByteOrderComparator);
Arrays.sort(data, unsignedByteOrderComparator);
File golden = writeAll("golden", data);
File sorted = new File(tempDir, "sorted");

View File

@ -117,7 +117,7 @@ public class WFSTCompletionTest extends LuceneTestCase {
// TODO: could be faster... but its slowCompletor for a reason
for (Map.Entry<String,Long> e : slowCompletor.entrySet()) {
if (e.getKey().startsWith(prefix)) {
matches.add(new LookupResult(e.getKey(), (float)e.getValue().longValue()));
matches.add(new LookupResult(e.getKey(), e.getValue().longValue()));
}
}

View File

@ -153,11 +153,6 @@ public class Suggester extends SolrSpellChecker {
build(core, searcher);
}
public void add(CharsRef query, int numHits) {
LOG.info("add " + query + ", " + numHits);
lookup.add(query, new Integer(numHits));
}
static SpellingResult EMPTY_RESULT = new SpellingResult();
@Override
@ -182,7 +177,7 @@ public class Suggester extends SolrSpellChecker {
Collections.sort(suggestions);
}
for (LookupResult lr : suggestions) {
res.add(t, lr.key.toString(), ((Number)lr.value).intValue());
res.add(t, lr.key.toString(), (int)lr.value);
}
}
return res;