diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index 3bd082ce3cb..e39c8134289 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -115,6 +115,13 @@ Changes in backwards compatibility policy * LUCENE-3626: The internal implementation classes in PKIndexSplitter and MultiPassIndexSplitter were made private as they now work per segment. (Uwe Schindler) + + * LUCENE-3807: Cleaned up Suggest / Lookup API. Term weights (freqs) are now + 64-bit signed integers instead of 32-bit floats. Sorting of terms is now a + disk-based merge sort instead of an in-memory sort. The Lookup API now + accepts and returns CharSequence instead of String; a CharSequence should be converted + into a String before being used in a data structure that relies on hashCode / equals. + (Simon Willnauer) Changes in Runtime Behavior diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/BufferingTermFreqIteratorWrapper.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/BufferingTermFreqIteratorWrapper.java index 12c89e06b53..dd6a86bffd1 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/suggest/BufferingTermFreqIteratorWrapper.java +++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/BufferingTermFreqIteratorWrapper.java @@ -25,6 +25,7 @@ import org.apache.lucene.util.BytesRef; /** * This wrapper buffers incoming elements. 
+ * @lucene.experimental */ public class BufferingTermFreqIteratorWrapper implements TermFreqIterator { // TODO keep this for now diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/BytesRefList.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/BytesRefList.java index 8925cca908d..d5ce9eff9c0 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/suggest/BytesRefList.java +++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/BytesRefList.java @@ -18,81 +18,113 @@ package org.apache.lucene.search.suggest; */ import java.io.IOException; +import java.util.Arrays; import java.util.Comparator; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ByteBlockPool; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefIterator; +import org.apache.lucene.util.Counter; +import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.SorterTemplate; -final class BytesRefList { - +/** + * A simple append only random-access {@link BytesRef} array that stores full + * copies of the appended bytes in a {@link ByteBlockPool}. + * + * + * Note: This class is not Thread-Safe! 
+ * + * @lucene.internal + * @lucene.experimental + */ +public final class BytesRefList { + // TODO rename to BytesRefArray private final ByteBlockPool pool; private int[] offsets = new int[1]; - private int currentElement = 0; + private int lastElement = 0; private int currentOffset = 0; - + private final Counter bytesUsed = Counter.newCounter(false); + + /** + * Creates a new {@link BytesRefList} + */ public BytesRefList() { - this(new ByteBlockPool(new ByteBlockPool.DirectAllocator())); - } - - public BytesRefList(ByteBlockPool pool) { - this.pool = pool; + this.pool = new ByteBlockPool(new ByteBlockPool.DirectTrackingAllocator( + bytesUsed)); pool.nextBuffer(); + bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + + RamUsageEstimator.NUM_BYTES_INT); } - - public int append(BytesRef bytes) { - if (currentElement >= offsets.length) { - offsets = ArrayUtil.grow(offsets, offsets.length + 1); - } - pool.copy(bytes); - offsets[currentElement++] = currentOffset; - currentOffset += bytes.length; - return currentElement; - } - - public int size() { - return currentElement; - } - - public BytesRef get(BytesRef bytes, int pos) { - if (currentElement > pos) { - bytes.offset = offsets[pos]; - bytes.length = pos == currentElement - 1 ? 
currentOffset - bytes.offset - : offsets[pos + 1] - bytes.offset; - pool.copyFrom(bytes); - return bytes; - } - throw new IndexOutOfBoundsException("index " + pos - + " must be less than the size: " + currentElement); - - } - - public BytesRefIterator iterator() { - final int numElements = currentElement; - - return new BytesRefIterator() { - private final BytesRef spare = new BytesRef(); - private int pos = 0; - - @Override - public BytesRef next() throws IOException { - if (pos < numElements) { - get(spare, pos++); - return spare; - } - return null; - } - - @Override - public Comparator getComparator() { - return null; - } - }; + + /** + * Clears this {@link BytesRefList} + */ + public void clear() { + lastElement = 0; + currentOffset = 0; + Arrays.fill(offsets, 0); + pool.reset(); } - public int[] sort(final Comparator comp) { + /** + * Appends a copy of the given {@link BytesRef} to this {@link BytesRefList}. + * @param bytes the bytes to append + * @return the ordinal of the appended bytes + */ + public int append(BytesRef bytes) { + if (lastElement >= offsets.length) { + int oldLen = offsets.length; + offsets = ArrayUtil.grow(offsets, offsets.length + 1); + bytesUsed.addAndGet((offsets.length - oldLen) + * RamUsageEstimator.NUM_BYTES_INT); + } + pool.copy(bytes); + offsets[lastElement++] = currentOffset; + currentOffset += bytes.length; + return lastElement; + } + + /** + * Returns the current size of this {@link BytesRefList} + * @return the current size of this {@link BytesRefList} + */ + public int size() { + return lastElement; + } + + /** + * Returns the n'th element of this {@link BytesRefList} + * @param spare a spare {@link BytesRef} instance + * @param ord the elements ordinal to retrieve + * @return the n'th element of this {@link BytesRefList} + */ + public BytesRef get(BytesRef spare, int ord) { + if (lastElement > ord) { + spare.offset = offsets[ord]; + spare.length = ord == lastElement - 1 ? 
currentOffset - spare.offset + : offsets[ord + 1] - spare.offset; + pool.copyFrom(spare); + return spare; + } + throw new IndexOutOfBoundsException("index " + ord + + " must be less than the size: " + lastElement); + + } + + /** + * Returns the number of bytes used internally to hold the appended bytes in + * memory + * + * @return the number of bytes used internally to hold the appended bytes in + * memory + */ + public long bytesUsed() { + return bytesUsed.get(); + } + + private int[] sort(final Comparator comp) { final int[] orderdEntries = new int[size()]; for (int i = 0; i < orderdEntries.length; i++) { orderdEntries[i] = i; @@ -110,22 +142,65 @@ final class BytesRefList { final int ord1 = orderdEntries[i], ord2 = orderdEntries[j]; return comp.compare(get(scratch1, ord1), get(scratch2, ord2)); } - + @Override protected void setPivot(int i) { final int ord = orderdEntries[i]; get(pivot, ord); } - + @Override protected int comparePivot(int j) { final int ord = orderdEntries[j]; return comp.compare(pivot, get(scratch2, ord)); } - private final BytesRef pivot = new BytesRef(), - scratch1 = new BytesRef(), scratch2 = new BytesRef(); + private final BytesRef pivot = new BytesRef(), scratch1 = new BytesRef(), + scratch2 = new BytesRef(); }.quickSort(0, size() - 1); return orderdEntries; } + + /** + * sugar for {@link #iterator(Comparator)} with a null comparator + */ + public BytesRefIterator iterator() { + return iterator(null); + } + + /** + *

+ * Returns a {@link BytesRefIterator} with point-in-time semantics. The + * iterator provides access to all {@link BytesRef} instances appended so far. + *

+ *

+ * If a non-null {@link Comparator} is provided, the iterator will + * iterate the byte values in the order specified by the comparator. Otherwise + * the values are returned in the order in which they were appended. + *

+ *

+ * This is a non-destructive operation. + *

+ */ + public BytesRefIterator iterator(final Comparator comp) { + final BytesRef spare = new BytesRef(); + final int size = size(); + final int[] ords = comp == null ? null : sort(comp); + return new BytesRefIterator() { + int pos = 0; + + @Override + public BytesRef next() throws IOException { + if (pos < size) { + return get(spare, ords == null ? pos++ : ords[pos++]); + } + return null; + } + + @Override + public Comparator getComparator() { + return comp; + } + }; + } } diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/FileDictionary.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/FileDictionary.java index 059e1c23601..49f9f762f72 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/suggest/FileDictionary.java +++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/FileDictionary.java @@ -75,7 +75,11 @@ public class FileDictionary implements Dictionary { String[] fields = line.split("\t"); if (fields.length > 1) { // keep reading floats for bw compat - curFreq = (int)Float.parseFloat(fields[1]); + try { + curFreq = Long.parseLong(fields[1]); + } catch (NumberFormatException e) { + curFreq = (long)Double.parseDouble(fields[1]); + } spare.copyChars(fields[0]); } else { spare.copyChars(line); diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java index eff3ee7b594..f6abab61e2f 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java +++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java @@ -29,15 +29,19 @@ import org.apache.lucene.search.spell.TermFreqIterator; import org.apache.lucene.util.BytesRefIterator; import org.apache.lucene.util.PriorityQueue; +/** + * Simple Lookup interface for {@link CharSequence} suggestions. + * @lucene.experimental + */ public abstract class Lookup { /** * Result of a lookup. 
*/ public static final class LookupResult implements Comparable { public final CharSequence key; - public final float value; + public final long value; - public LookupResult(CharSequence key, float value) { + public LookupResult(CharSequence key, long value) { this.key = key; this.value = value; } @@ -112,6 +116,10 @@ public abstract class Lookup { build(tfit); } + /** + * Builds up a new internal {@link Lookup} representation based on the given {@link TermFreqIterator}. + * The implementation might re-sort the data internally. + */ public abstract void build(TermFreqIterator tfit) throws IOException; /** @@ -124,22 +132,7 @@ public abstract class Lookup { */ public abstract List lookup(CharSequence key, boolean onlyMorePopular, int num); - /** - * Modify the lookup data by recording additional data. Optional operation. - * @param key new lookup key - * @param value value to associate with this key - * @return true if new key is added, false if it already exists or operation - * is not supported. - */ - public abstract boolean add(CharSequence key, Object value); - /** - * Get value associated with a specific key. - * @param key lookup key - * @return associated value - */ - public abstract Object get(CharSequence key); - /** * Persist the constructed lookup data to a directory. Optional operation. * @param output {@link OutputStream} to write the data to. @@ -173,4 +166,5 @@ public abstract class Lookup { * @throws IOException when fatal IO error occurs. 
*/ public abstract boolean load(File storeDir) throws IOException; + } diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java index 2380724c9a6..020618148be 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java +++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java @@ -17,45 +17,166 @@ package org.apache.lucene.search.suggest; * limitations under the License. */ +import java.io.File; import java.io.IOException; import java.util.Comparator; import org.apache.lucene.search.spell.TermFreqIterator; +import org.apache.lucene.search.suggest.fst.Sort; +import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesReader; +import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesWriter; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; /** - * This wrapper buffers incoming elements and makes sure they are sorted in - * ascending lexicographic order. + * This wrapper buffers incoming elements and makes sure they are sorted based on given comparator. + * @lucene.experimental */ -public class SortedTermFreqIteratorWrapper extends BufferingTermFreqIteratorWrapper { - // TODO keep this for now - but the consumer should really sort this stuff on disk with sorter... 
- private final int[] sortedOrds; - private int currentOrd = -1; - private final BytesRef spare = new BytesRef(); - private final Comparator comp; - - public SortedTermFreqIteratorWrapper(TermFreqIterator source, Comparator comp) throws IOException { - super(source); - this.sortedOrds = entries.sort(comp); - this.comp = comp; +public class SortedTermFreqIteratorWrapper implements TermFreqIterator { + + private final TermFreqIterator source; + private File tempInput; + private File tempSorted; + private final ByteSequencesReader reader; + private boolean done = false; + + private long weight; + private final BytesRef scratch = new BytesRef(); + private final Comparator comparator; + + public SortedTermFreqIteratorWrapper(TermFreqIterator source, Comparator comparator) throws IOException { + this(source, comparator, false); } - - @Override - public long weight() { - return freqs[currentOrd]; + + public SortedTermFreqIteratorWrapper(TermFreqIterator source, Comparator comparator, boolean compareRawBytes) throws IOException { + this.source = source; + this.comparator = comparator; + this.reader = sort(compareRawBytes ? 
comparator : new BytesOnlyComparator(this.comparator)); } - + @Override public BytesRef next() throws IOException { - if (++curPos < entries.size()) { - return entries.get(spare, (currentOrd = sortedOrds[curPos])); + boolean success = false; + if (done) { + return null; + } + try { + ByteArrayDataInput input = new ByteArrayDataInput(); + if (reader.read(scratch)) { + weight = decode(scratch, input); + success = true; + return scratch; + } + close(); + success = done = true; + return null; + } finally { + if (!success) { + done = true; + close(); + } } - return null; } - + @Override public Comparator getComparator() { - return comp; + return comparator; + } + + @Override + public long weight() { + return weight; + } + + private Sort.ByteSequencesReader sort(Comparator comparator) throws IOException { + String prefix = getClass().getSimpleName(); + File directory = Sort.defaultTempDir(); + tempInput = File.createTempFile(prefix, ".input", directory); + tempSorted = File.createTempFile(prefix, ".sorted", directory); + + final Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput); + boolean success = false; + try { + BytesRef spare; + byte[] buffer = new byte[0]; + ByteArrayDataOutput output = new ByteArrayDataOutput(buffer); + + while ((spare = source.next()) != null) { + encode(writer, output, buffer, spare, source.weight()); + } + writer.close(); + new Sort(comparator).sort(tempInput, tempSorted); + ByteSequencesReader reader = new Sort.ByteSequencesReader(tempSorted); + success = true; + return reader; + + } finally { + if (success) { + IOUtils.close(writer); + } else { + try { + IOUtils.closeWhileHandlingException(writer); + } finally { + close(); + } + } + + } + } + + private void close() throws IOException { + if (tempInput != null) { + tempInput.delete(); + } + if (tempSorted != null) { + tempSorted.delete(); + } + IOUtils.close(reader); + } + + private final static class BytesOnlyComparator implements Comparator { + + final Comparator 
other; + private final BytesRef leftScratch = new BytesRef(); + private final BytesRef rightScratch = new BytesRef(); + + public BytesOnlyComparator(Comparator other) { + this.other = other; + } + + @Override + public int compare(BytesRef left, BytesRef right) { + wrap(leftScratch, left); + wrap(rightScratch, right); + return other.compare(leftScratch, rightScratch); + } + + private void wrap(BytesRef wrapper, BytesRef source) { + wrapper.bytes = source.bytes; + wrapper.offset = source.offset; + wrapper.length = source.length - 8; + + } + } + + protected void encode(ByteSequencesWriter writer, ByteArrayDataOutput output, byte[] buffer, BytesRef spare, long weight) throws IOException { + if (spare.length + 8 >= buffer.length) { + buffer = ArrayUtil.grow(buffer, spare.length + 8); + } + output.reset(buffer); + output.writeBytes(spare.bytes, spare.offset, spare.length); + output.writeLong(weight); + writer.write(buffer, 0, output.getPosition()); + } + + protected long decode(BytesRef scratch, ByteArrayDataInput tmpInput) { + tmpInput.reset(scratch.bytes); + tmpInput.skipBytes(scratch.length - 8); // suggestion + separator + scratch.length -= 8; // sep + long + return tmpInput.readLong(); } } diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/UnsortedTermFreqIteratorWrapper.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/UnsortedTermFreqIteratorWrapper.java index 4a7e3d8d027..a97b170bdb4 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/suggest/UnsortedTermFreqIteratorWrapper.java +++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/UnsortedTermFreqIteratorWrapper.java @@ -26,6 +26,7 @@ import org.apache.lucene.util.BytesRef; /** * This wrapper buffers the incoming elements and makes sure they are in * random order. 
+ * @lucene.experimental */ public class UnsortedTermFreqIteratorWrapper extends BufferingTermFreqIteratorWrapper { // TODO keep this for now diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/BytesRefSorter.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/BytesRefSorter.java index c7f42cb812b..3d141023550 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/BytesRefSorter.java +++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/BytesRefSorter.java @@ -18,13 +18,16 @@ package org.apache.lucene.search.suggest.fst; */ import java.io.IOException; -import java.util.Iterator; +import java.util.Comparator; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefIterator; /** * Collects {@link BytesRef} and then allows one to iterate over their sorted order. Implementations - * of this interface will be called in a single-threaded scenario. + * of this interface will be called in a single-threaded scenario. + * @lucene.experimental + * @lucene.internal */ public interface BytesRefSorter { /** @@ -42,5 +45,7 @@ public interface BytesRefSorter { * * @throws IOException If an I/O exception occurs. 
*/ - Iterator iterator() throws IOException; + BytesRefIterator iterator() throws IOException; + + Comparator getComparator(); } diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/ExternalRefSorter.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/ExternalRefSorter.java index a28d57f229e..77995c11843 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/ExternalRefSorter.java +++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/ExternalRefSorter.java @@ -18,59 +18,63 @@ package org.apache.lucene.search.suggest.fst; */ import java.io.*; -import java.util.Iterator; -import java.util.NoSuchElementException; +import java.util.Comparator; import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesReader; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefIterator; +import org.apache.lucene.util.IOUtils; /** * Builds and iterates over sequences stored on disk. + * @lucene.experimental + * @lucene.internal */ public class ExternalRefSorter implements BytesRefSorter, Closeable { private final Sort sort; private Sort.ByteSequencesWriter writer; private File input; - private File sorted; - + private File sorted; + /** * Will buffer all sequences to a temporary file and then sort (all on-disk). 
*/ public ExternalRefSorter(Sort sort) throws IOException { this.sort = sort; - this.input = File.createTempFile("RefSorter-", ".raw", Sort.defaultTempDir()); + this.input = File.createTempFile("RefSorter-", ".raw", + Sort.defaultTempDir()); this.writer = new Sort.ByteSequencesWriter(input); } - + @Override public void add(BytesRef utf8) throws IOException { - if (writer == null) - throw new IllegalStateException(); + if (writer == null) throw new IllegalStateException(); writer.write(utf8); } - - @Override - public Iterator iterator() throws IOException { + + public BytesRefIterator iterator() throws IOException { if (sorted == null) { closeWriter(); - - sorted = File.createTempFile("RefSorter-", ".sorted", Sort.defaultTempDir()); + + sorted = File.createTempFile("RefSorter-", ".sorted", + Sort.defaultTempDir()); sort.sort(input, sorted); - + input.delete(); input = null; } - - return new ByteSequenceIterator(new Sort.ByteSequencesReader(sorted)); + + return new ByteSequenceIterator(new Sort.ByteSequencesReader(sorted), + sort.getComparator()); } - + private void closeWriter() throws IOException { if (writer != null) { writer.close(); writer = null; } } - + /** * Removes any written temporary files. */ @@ -83,40 +87,54 @@ public class ExternalRefSorter implements BytesRefSorter, Closeable { if (sorted != null) sorted.delete(); } } - + /** * Iterate over byte refs in a file. 
*/ - class ByteSequenceIterator implements Iterator { - private ByteSequencesReader reader; - private byte[] next; - - public ByteSequenceIterator(ByteSequencesReader reader) throws IOException { + class ByteSequenceIterator implements BytesRefIterator { + private final ByteSequencesReader reader; + private BytesRef scratch = new BytesRef(); + private final Comparator comparator; + + public ByteSequenceIterator(ByteSequencesReader reader, + Comparator comparator) { this.reader = reader; - this.next = reader.read(); - } - - @Override - public boolean hasNext() { - return next != null; + this.comparator = comparator; } @Override - public BytesRef next() { - if (next == null) throw new NoSuchElementException(); - BytesRef r = new BytesRef(next); - try { - next = reader.read(); - if (next == null) { - reader.close(); - } - } catch (IOException e) { - throw new RuntimeException(e); + public BytesRef next() throws IOException { + if (scratch == null) { + return null; + } + boolean success = false; + try { + byte[] next = reader.read(); + if (next != null) { + scratch.bytes = next; + scratch.length = next.length; + scratch.offset = 0; + } else { + IOUtils.close(reader); + scratch = null; + } + success = true; + return scratch; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(reader); + } } - return r; } - + @Override - public void remove() { throw new UnsupportedOperationException(); } + public Comparator getComparator() { + return comparator; + } + } + + @Override + public Comparator getComparator() { + return sort.getComparator(); } } diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletion.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletion.java index 59fdc4cde75..9e49b1e2795 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletion.java +++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletion.java @@ -28,6 +28,7 @@ import 
org.apache.lucene.util.fst.FST.Arc; * Finite state automata based implementation of "autocomplete" functionality. * * @see FSTCompletionBuilder + * @lucene.experimental */ // TODO: we could store exact weights as outputs from the FST (int4 encoded @@ -159,10 +160,10 @@ public class FSTCompletion { * @param utf8 * The sequence of utf8 bytes to follow. * - * @return Returns the bucket number of the match or null if no + * @return Returns the bucket number of the match or -1 if no * match was found. */ - private Integer getExactMatchStartingFromRootArc( + private int getExactMatchStartingFromRootArc( int rootArcIndex, BytesRef utf8) { // Get the UTF-8 bytes representation of the input key. try { @@ -186,7 +187,7 @@ public class FSTCompletion { } // No match. - return null; + return -1; } /** @@ -273,8 +274,8 @@ public class FSTCompletion { // exact match, if requested. if (exactFirst) { if (!checkExistingAndReorder(res, key)) { - Integer exactMatchBucket = getExactMatchStartingFromRootArc(i, key); - if (exactMatchBucket != null) { + int exactMatchBucket = getExactMatchStartingFromRootArc(i, key); + if (exactMatchBucket != -1) { // Insert as the first result and truncate at num. while (res.size() >= num) { res.remove(res.size() - 1); @@ -385,10 +386,10 @@ public class FSTCompletion { } /** - * Returns the bucket assigned to a given key (if found) or null if + * Returns the bucket assigned to a given key (if found) or -1 if * no exact match exists. 
*/ - public Integer getBucket(CharSequence key) { + public int getBucket(CharSequence key) { return getExactMatchStartingFromRootArc(0, new BytesRef(key)); } diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java index f82194d6c2b..ba4c5c7cf2f 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java +++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java @@ -19,9 +19,9 @@ package org.apache.lucene.search.suggest.fst; import java.io.Closeable; import java.io.IOException; -import java.util.Iterator; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefIterator; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.fst.*; @@ -98,6 +98,7 @@ import org.apache.lucene.util.fst.*; * change, requiring you to rebuild the FST suggest index. * * @see FSTCompletion + * @lucene.experimental */ public class FSTCompletionBuilder { /** @@ -143,10 +144,11 @@ public class FSTCompletionBuilder { /** * Creates an {@link FSTCompletion} with default options: 10 buckets, exact match - * promoted to first position and {@link InMemorySorter}. + * promoted to first position and {@link InMemorySorter} with a comparator obtained from + * {@link BytesRef#getUTF8SortedAsUnicodeComparator()}. 
*/ public FSTCompletionBuilder() { - this(DEFAULT_BUCKETS, new InMemorySorter(), Integer.MAX_VALUE); + this(DEFAULT_BUCKETS, new InMemorySorter(BytesRef.getUTF8SortedAsUnicodeComparator()), Integer.MAX_VALUE); } /** @@ -237,10 +239,12 @@ public class FSTCompletionBuilder { shareMaxTailLength, outputs, null, false); BytesRef scratch = new BytesRef(); + BytesRef entry; final IntsRef scratchIntsRef = new IntsRef(); int count = 0; - for (Iterator i = sorter.iterator(); i.hasNext(); count++) { - BytesRef entry = i.next(); + BytesRefIterator iter = sorter.iterator(); + while((entry = iter.next()) != null) { + count++; if (scratch.compareTo(entry) != 0) { builder.add(Util.toIntsRef(entry, scratchIntsRef), empty); scratch.copyBytes(entry); diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java index aee2ea1c502..9bd0ce79170 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java +++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java @@ -59,6 +59,7 @@ import org.apache.lucene.util.fst.NoOutputs; * use {@link FSTCompletion} directly or {@link TSTLookup}, for example. 
* * @see FSTCompletion + * @lucene.experimental */ public class FSTCompletionLookup extends Lookup { /** @@ -171,7 +172,7 @@ public class FSTCompletionLookup extends Lookup { } output.reset(buffer); - output.writeInt(FloatMagic.toSortable(tfit.weight())); + output.writeInt(encodeWeight(tfit.weight())); output.writeBytes(spare.bytes, spare.offset, spare.length); writer.write(buffer, 0, output.getPosition()); } @@ -188,13 +189,13 @@ public class FSTCompletionLookup extends Lookup { reader = new Sort.ByteSequencesReader(tempSorted); long line = 0; int previousBucket = 0; - float previousScore = 0; + int previousScore = 0; ByteArrayDataInput input = new ByteArrayDataInput(); BytesRef tmp1 = new BytesRef(); BytesRef tmp2 = new BytesRef(); while (reader.read(tmp1)) { input.reset(tmp1.bytes); - float currentScore = FloatMagic.fromSortable(input.readInt()); + int currentScore = input.readInt(); int bucket; if (line > 0 && currentScore == previousScore) { @@ -230,6 +231,14 @@ public class FSTCompletionLookup extends Lookup { tempSorted.delete(); } } + + /** weight -> cost */ + private static int encodeWeight(long value) { + if (value < Integer.MIN_VALUE || value > Integer.MAX_VALUE) { + throw new UnsupportedOperationException("cannot encode value: " + value); + } + return (int)value; + } @Override public List lookup(CharSequence key, boolean higherWeightsFirst, int num) { @@ -250,19 +259,9 @@ public class FSTCompletionLookup extends Lookup { return results; } - @Override - public boolean add(CharSequence key, Object value) { - // Not supported. - return false; - } - - @Override public Object get(CharSequence key) { - Integer bucket = normalCompletion.getBucket(key); - if (bucket == null) - return null; - else - return (float) normalCompletion.getBucket(key) / normalCompletion.getBucketCount(); + final int bucket = normalCompletion.getBucket(key); + return bucket == -1 ? 
null : Long.valueOf(bucket); } /** diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FloatMagic.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FloatMagic.java deleted file mode 100644 index 16583566fa1..00000000000 --- a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FloatMagic.java +++ /dev/null @@ -1,75 +0,0 @@ -package org.apache.lucene.search.suggest.fst; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.util.NumericUtils; - -/** - * Converts normalized float representations ({@link Float#floatToIntBits(float)}) - * into integers that are directly sortable in int4 representation (or unsigned values or - * after promoting to a long with higher 32-bits zeroed). - */ -class FloatMagic { - /** - * Convert a float to a directly sortable unsigned integer. For sortable signed - * integers, see {@link NumericUtils#floatToSortableInt(float)}. - */ - public static int toSortable(float f) { - return floatBitsToUnsignedOrdered(Float.floatToRawIntBits(f)); - } - - /** - * Back from {@link #toSortable(float)} to float. 
- */ - public static float fromSortable(int v) { - return Float.intBitsToFloat(unsignedOrderedToFloatBits(v)); - } - - /** - * Convert float bits to directly sortable bits. - * Normalizes all NaNs to canonical form. - */ - static int floatBitsToUnsignedOrdered(int v) { - // Canonicalize NaN ranges. I assume this check will be faster here than - // (v == v) == false on the FPU? We don't distinguish between different - // flavors of NaNs here (see http://en.wikipedia.org/wiki/NaN). I guess - // in Java this doesn't matter much anyway. - if ((v & 0x7fffffff) > 0x7f800000) { - // Apply the logic below to a canonical "quiet NaN" - return 0x7fc00000 ^ 0x80000000; - } - - if (v < 0) { - // Reverse the order of negative values and push them before positive values. - return ~v; - } else { - // Shift positive values after negative, but before NaNs, they're sorted already. - return v ^ 0x80000000; - } - } - - /** - * Back from {@link #floatBitsToUnsignedOrdered(int)}. - */ - static int unsignedOrderedToFloatBits(int v) { - if (v < 0) - return v & ~0x80000000; - else - return ~v; - } -} diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/InMemorySorter.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/InMemorySorter.java index 1e293530a46..ce6a17d721f 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/InMemorySorter.java +++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/InMemorySorter.java @@ -17,29 +17,40 @@ package org.apache.lucene.search.suggest.fst; * limitations under the License. */ -import java.util.*; +import java.util.Comparator; +import org.apache.lucene.search.suggest.BytesRefList; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefIterator; /** * An {@link BytesRefSorter} that keeps all the entries in memory. 
+ * @lucene.experimental + * @lucene.internal */ public final class InMemorySorter implements BytesRefSorter { - // TODO: use a single byte[] to back up all entries? - private final ArrayList refs = new ArrayList(); - + private final BytesRefList buffer = new BytesRefList(); private boolean closed = false; + private final Comparator comparator; + public InMemorySorter(Comparator comparator) { + this.comparator = comparator; + } + @Override public void add(BytesRef utf8) { if (closed) throw new IllegalStateException(); - refs.add(BytesRef.deepCopyOf(utf8)); + buffer.append(utf8); } @Override - public Iterator iterator() { + public BytesRefIterator iterator() { closed = true; - Collections.sort(refs, BytesRef.getUTF8SortedAsUnicodeComparator()); - return Collections.unmodifiableCollection(refs).iterator(); + return buffer.iterator(comparator); + } + + @Override + public Comparator getComparator() { + return comparator; } } diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/Sort.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/Sort.java index 47942ed2d9e..8d9e5e3724a 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/Sort.java +++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/Sort.java @@ -20,15 +20,10 @@ package org.apache.lucene.search.suggest.fst; import java.io.*; import java.util.*; +import org.apache.lucene.search.suggest.BytesRefList; import org.apache.lucene.util.*; import org.apache.lucene.util.PriorityQueue; -// TODO: the buffer is currently byte[][] which with very small arrays will terribly overallocate -// memory (alignments) and make GC very happy. -// -// We could move it to a single byte[] + and use custom sorting, but we'd need to check if this -// yields any improvement first. - /** * On-disk sorting of byte arrays. 
Each byte array (entry) is a composed of the following * fields: @@ -38,6 +33,8 @@ import org.apache.lucene.util.PriorityQueue; * * * @see #sort(File, File) + * @lucene.experimental + * @lucene.internal */ public final class Sort { public final static int MB = 1024 * 1024; @@ -59,11 +56,6 @@ public final class Sort { */ public final static int MAX_TEMPFILES = 128; - /** - * Minimum slot buffer expansion. - */ - private final static int MIN_EXPECTED_GROWTH = 1000; - /** * A bit more descriptive unit for constructors. * @@ -111,21 +103,6 @@ public final class Sort { } } - /** - * byte[] in unsigned byte order. - */ - static final Comparator unsignedByteOrderComparator = new Comparator() { - public int compare(byte[] left, byte[] right) { - final int max = Math.min(left.length, right.length); - for (int i = 0, j = 0; i < max; i++, j++) { - int diff = (left[i] & 0xff) - (right[j] & 0xff); - if (diff != 0) - return diff; - } - return left.length - right.length; - } - }; - /** * Sort info (debugging mostly). */ @@ -149,14 +126,15 @@ public final class Sort { } } - private final static byte [][] EMPTY = new byte [0][]; - private final BufferSize ramBufferSize; private final File tempDirectory; - - private byte [][] buffer = new byte [0][]; + + private final BytesRefList buffer = new BytesRefList(); private SortInfo sortInfo; private int maxTempFiles; + private final Comparator comparator; + + public static final Comparator DEFAULT_COMPARATOR = BytesRef.getUTF8SortedAsUnicodeComparator(); /** * Defaults constructor. @@ -165,13 +143,17 @@ public final class Sort { * @see BufferSize#automatic() */ public Sort() throws IOException { - this(BufferSize.automatic(), defaultTempDir(), MAX_TEMPFILES); + this(DEFAULT_COMPARATOR, BufferSize.automatic(), defaultTempDir(), MAX_TEMPFILES); + } + + public Sort(Comparator comparator) throws IOException { + this(comparator, BufferSize.automatic(), defaultTempDir(), MAX_TEMPFILES); } /** * All-details constructor. 
*/ - public Sort(BufferSize ramBufferSize, File tempDirectory, int maxTempfiles) { + public Sort(Comparator comparator, BufferSize ramBufferSize, File tempDirectory, int maxTempfiles) { if (ramBufferSize.bytes < ABSOLUTE_MIN_SORT_BUFFER_SIZE) { throw new IllegalArgumentException(MIN_BUFFER_SIZE_MSG + ": " + ramBufferSize.bytes); } @@ -183,6 +165,7 @@ public final class Sort { this.ramBufferSize = ramBufferSize; this.tempDirectory = tempDirectory; this.maxTempFiles = maxTempfiles; + this.comparator = comparator; } /** @@ -283,23 +266,25 @@ public final class Sort { /** Sort a single partition in-memory. */ protected File sortPartition(int len) throws IOException { - byte [][] data = this.buffer; + BytesRefList data = this.buffer; File tempFile = File.createTempFile("sort", "partition", tempDirectory); long start = System.currentTimeMillis(); - Arrays.sort(data, 0, len, unsignedByteOrderComparator); sortInfo.sortTime += (System.currentTimeMillis() - start); - ByteSequencesWriter out = new ByteSequencesWriter(tempFile); + final ByteSequencesWriter out = new ByteSequencesWriter(tempFile); + BytesRef spare; try { - for (int i = 0; i < len; i++) { - assert data[i].length <= Short.MAX_VALUE; - out.write(data[i]); + BytesRefIterator iter = buffer.iterator(comparator); + while((spare = iter.next()) != null) { + assert spare.length <= Short.MAX_VALUE; + out.write(spare); } + out.close(); // Clean up the buffer for the next partition. 
- this.buffer = EMPTY; + data.clear(); return tempFile; } finally { IOUtils.close(out); @@ -314,7 +299,7 @@ public final class Sort { PriorityQueue queue = new PriorityQueue(merges.size()) { protected boolean lessThan(FileAndTop a, FileAndTop b) { - return a.current.compareTo(b.current) < 0; + return comparator.compare(a.current, b.current) < 0; } }; @@ -359,33 +344,18 @@ public final class Sort { /** Read in a single partition of data */ int readPartition(ByteSequencesReader reader) throws IOException { long start = System.currentTimeMillis(); - - // We will be reallocating from scratch. - Arrays.fill(this.buffer, null); - - int bytesLimit = this.ramBufferSize.bytes; - byte [][] data = this.buffer; - byte[] line; - int linesRead = 0; - while ((line = reader.read()) != null) { - if (linesRead + 1 >= data.length) { - data = Arrays.copyOf(data, - ArrayUtil.oversize(linesRead + MIN_EXPECTED_GROWTH, - RamUsageEstimator.NUM_BYTES_OBJECT_REF)); - } - data[linesRead++] = line; - + final BytesRef scratch = new BytesRef(); + while ((scratch.bytes = reader.read()) != null) { + scratch.length = scratch.bytes.length; + buffer.append(scratch); // Account for the created objects. // (buffer slots do not account to buffer size.) 
- bytesLimit -= line.length + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER; - if (bytesLimit < 0) { + if (ramBufferSize.bytes < buffer.bytesUsed()) { break; } } - this.buffer = data; - sortInfo.readTime += (System.currentTimeMillis() - start); - return linesRead; + return buffer.size(); } static class FileAndTop { @@ -515,5 +485,9 @@ public final class Sort { ((Closeable) is).close(); } } + } + + public Comparator getComparator() { + return comparator; } } \ No newline at end of file diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java index f5f37c36c8a..330cf3c82a4 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java +++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java @@ -28,6 +28,8 @@ import java.util.List; import org.apache.lucene.search.spell.TermFreqIterator; import org.apache.lucene.search.suggest.Lookup; +import org.apache.lucene.search.suggest.SortedTermFreqIteratorWrapper; +import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesWriter; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ByteArrayDataOutput; import org.apache.lucene.store.InputStreamDataInput; @@ -102,72 +104,27 @@ public class WFSTCompletionLookup extends Lookup { @Override public void build(TermFreqIterator iterator) throws IOException { - String prefix = getClass().getSimpleName(); - File directory = Sort.defaultTempDir(); - File tempInput = File.createTempFile(prefix, ".input", directory); - File tempSorted = File.createTempFile(prefix, ".sorted", directory); - - Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput); - Sort.ByteSequencesReader reader = null; BytesRef scratch = new BytesRef(); - - boolean success = false; - try { - byte [] buffer = new byte [0]; - ByteArrayDataOutput output = new 
ByteArrayDataOutput(buffer); - BytesRef spare; - while ((spare = iterator.next()) != null) { - if (spare.length + 5 >= buffer.length) { - buffer = ArrayUtil.grow(buffer, spare.length + 5); - } - - output.reset(buffer); - output.writeBytes(spare.bytes, spare.offset, spare.length); - output.writeByte((byte)0); // separator: not used, just for sort order - output.writeInt((int)encodeWeight(iterator.weight())); - writer.write(buffer, 0, output.getPosition()); - } - writer.close(); - new Sort().sort(tempInput, tempSorted); - reader = new Sort.ByteSequencesReader(tempSorted); + TermFreqIterator iter = new WFSTTermFreqIteratorWrapper(iterator, + BytesRef.getUTF8SortedAsUnicodeComparator()); + IntsRef scratchInts = new IntsRef(); + BytesRef previous = null; + PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); + Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, outputs); + while ((scratch = iter.next()) != null) { + long cost = iter.weight(); - PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); - Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, outputs); - - BytesRef previous = null; - BytesRef suggestion = new BytesRef(); - IntsRef scratchInts = new IntsRef(); - ByteArrayDataInput input = new ByteArrayDataInput(); - while (reader.read(scratch)) { - suggestion.bytes = scratch.bytes; - suggestion.offset = scratch.offset; - suggestion.length = scratch.length - 5; // int + separator - - input.reset(scratch.bytes); - input.skipBytes(suggestion.length + 1); // suggestion + separator - long cost = input.readInt(); - - if (previous == null) { - previous = new BytesRef(); - } else if (suggestion.equals(previous)) { - continue; // for duplicate suggestions, the best weight is actually added - } - Util.toIntsRef(suggestion, scratchInts); - builder.add(scratchInts, cost); - previous.copyBytes(suggestion); + if (previous == null) { + previous = new BytesRef(); + } else if (scratch.equals(previous)) { + continue; // for duplicate suggestions, the 
best weight is actually + // added } - fst = builder.finish(); - success = true; - } finally { - if (success) { - IOUtils.close(reader, writer); - } else { - IOUtils.closeWhileHandlingException(reader, writer); - } - - tempInput.delete(); - tempSorted.delete(); + Util.toIntsRef(scratch, scratchInts); + builder.add(scratchInts, cost); + previous.copyBytes(scratch); } + fst = builder.finish(); } @Override @@ -270,16 +227,10 @@ public class WFSTCompletionLookup extends Lookup { return output; } - @Override - public boolean add(CharSequence key, Object value) { - return false; // Not supported. - } - /** * Returns the weight associated with an input string, * or null if it does not exist. */ - @Override public Object get(CharSequence key) { Arc arc = new Arc(); Long result = null; @@ -289,23 +240,51 @@ public class WFSTCompletionLookup extends Lookup { if (result == null || !arc.isFinal()) { return null; } else { - return decodeWeight(result + arc.nextFinalOutput); + return Integer.valueOf(decodeWeight(result + arc.nextFinalOutput)); } } /** cost -> weight */ - private static float decodeWeight(long encoded) { - return Integer.MAX_VALUE - encoded; + private static int decodeWeight(long encoded) { + return (int)(Integer.MAX_VALUE - encoded); } /** weight -> cost */ - private static long encodeWeight(float value) { - if (Float.isNaN(value) || Float.isInfinite(value) || value < 0 || value > Integer.MAX_VALUE) { + private static int encodeWeight(long value) { + if (value < 0 || value > Integer.MAX_VALUE) { throw new UnsupportedOperationException("cannot encode value: " + value); } return Integer.MAX_VALUE - (int)value; } + private final class WFSTTermFreqIteratorWrapper extends SortedTermFreqIteratorWrapper { + + WFSTTermFreqIteratorWrapper(TermFreqIterator source, + Comparator comparator) throws IOException { + super(source, comparator, true); + } + + @Override + protected void encode(ByteSequencesWriter writer, ByteArrayDataOutput output, byte[] buffer, BytesRef spare, 
long weight) throws IOException { + if (spare.length + 5 >= buffer.length) { + buffer = ArrayUtil.grow(buffer, spare.length + 5); + } + output.reset(buffer); + output.writeBytes(spare.bytes, spare.offset, spare.length); + output.writeByte((byte)0); // separator: not used, just for sort order + output.writeInt(encodeWeight(weight)); + writer.write(buffer, 0, output.getPosition()); + } + + @Override + protected long decode(BytesRef scratch, ByteArrayDataInput tmpInput) { + tmpInput.reset(scratch.bytes); + tmpInput.skipBytes(scratch.length - 4); // suggestion + separator + scratch.length -= 5; // sep + long + return tmpInput.readInt(); + } + } + static final Comparator weightComparator = new Comparator () { public int compare(Long left, Long right) { return left.compareTo(right); diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/jaspell/JaspellLookup.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/jaspell/JaspellLookup.java index 56a47514506..b7bb15e8a46 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/suggest/jaspell/JaspellLookup.java +++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/jaspell/JaspellLookup.java @@ -55,24 +55,22 @@ public class JaspellLookup extends Lookup { final CharsRef charsSpare = new CharsRef(); while ((spare = tfit.next()) != null) { - float freq = tfit.weight(); + final long weight = tfit.weight(); if (spare.length == 0) { continue; } charsSpare.grow(spare.length); UnicodeUtil.UTF8toUTF16(spare.bytes, spare.offset, spare.length, charsSpare); - trie.put(charsSpare.toString(), new Float(freq)); + trie.put(charsSpare.toString(), Long.valueOf(weight)); } } - @Override public boolean add(CharSequence key, Object value) { trie.put(key, value); // XXX return false; } - @Override public Object get(CharSequence key) { return trie.get(key); } @@ -95,7 +93,7 @@ public class JaspellLookup extends Lookup { if (onlyMorePopular) { LookupPriorityQueue queue = new LookupPriorityQueue(num); for 
(String s : list) { - float freq = (Float)trie.get(s); + long freq = ((Number)trie.get(s)).longValue(); queue.insertWithOverflow(new LookupResult(new CharsRef(s), freq)); } for (LookupResult lr : queue.getResults()) { @@ -104,7 +102,7 @@ public class JaspellLookup extends Lookup { } else { for (int i = 0; i < maxCnt; i++) { String s = list.get(i); - float freq = (Float)trie.get(s); + long freq = ((Number)trie.get(s)).longValue(); res.add(new LookupResult(new CharsRef(s), freq)); } } @@ -131,7 +129,7 @@ public class JaspellLookup extends Lookup { node.splitchar = in.readChar(); byte mask = in.readByte(); if ((mask & HAS_VALUE) != 0) { - node.data = new Float(in.readFloat()); + node.data = Long.valueOf(in.readLong()); } if ((mask & LO_KID) != 0) { TSTNode kid = trie.new TSTNode('\0', node); @@ -171,7 +169,7 @@ public class JaspellLookup extends Lookup { if (node.data != null) mask |= HAS_VALUE; out.writeByte(mask); if (node.data != null) { - out.writeFloat((Float)node.data); + out.writeLong(((Number)node.data).longValue()); } writeRecursively(out, node.relatives[TSTNode.LOKID]); writeRecursively(out, node.relatives[TSTNode.EQKID]); diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/tst/TSTLookup.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/tst/TSTLookup.java index 56b00a3ca6a..99e4e6a8c46 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/suggest/tst/TSTLookup.java +++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/tst/TSTLookup.java @@ -50,26 +50,24 @@ public class TSTLookup extends Lookup { } ArrayList tokens = new ArrayList(); - ArrayList vals = new ArrayList(); + ArrayList vals = new ArrayList(); BytesRef spare; CharsRef charsSpare = new CharsRef(); while ((spare = tfit.next()) != null) { charsSpare.grow(spare.length); UnicodeUtil.UTF8toUTF16(spare.bytes, spare.offset, spare.length, charsSpare); tokens.add(charsSpare.toString()); - vals.add(new Float(tfit.weight())); + 
vals.add(Long.valueOf(tfit.weight())); } autocomplete.balancedTree(tokens.toArray(), vals.toArray(), 0, tokens.size() - 1, root); } - @Override public boolean add(CharSequence key, Object value) { autocomplete.insert(root, key, value, 0); // XXX we don't know if a new node was created return true; } - @Override public Object get(CharSequence key) { List list = autocomplete.prefixCompletion(root, key, 0); if (list == null || list.isEmpty()) { @@ -107,7 +105,7 @@ public class TSTLookup extends Lookup { if (onlyMorePopular) { LookupPriorityQueue queue = new LookupPriorityQueue(num); for (TernaryTreeNode ttn : list) { - queue.insertWithOverflow(new LookupResult(ttn.token, (Float)ttn.val)); + queue.insertWithOverflow(new LookupResult(ttn.token, ((Number)ttn.val).longValue())); } for (LookupResult lr : queue.getResults()) { res.add(lr); @@ -115,7 +113,7 @@ public class TSTLookup extends Lookup { } else { for (int i = 0; i < maxCnt; i++) { TernaryTreeNode ttn = list.get(i); - res.add(new LookupResult(ttn.token, (Float)ttn.val)); + res.add(new LookupResult(ttn.token, ((Number)ttn.val).longValue())); } } return res; @@ -146,7 +144,7 @@ public class TSTLookup extends Lookup { node.token = in.readUTF(); } if ((mask & HAS_VALUE) != 0) { - node.val = new Float(in.readFloat()); + node.val = Long.valueOf(in.readLong()); } if ((mask & LO_KID) != 0) { node.loKid = new TernaryTreeNode(); @@ -184,7 +182,7 @@ public class TSTLookup extends Lookup { if (node.val != null) mask |= HAS_VALUE; out.writeByte(mask); if (node.token != null) out.writeUTF(node.token); - if (node.val != null) out.writeFloat((Float)node.val); + if (node.val != null) out.writeLong(((Number)node.val).longValue()); // recurse and write kids if (node.loKid != null) { writeRecursively(out, node.loKid); diff --git a/modules/suggest/src/test/org/apache/lucene/search/suggest/PersistenceTest.java b/modules/suggest/src/test/org/apache/lucene/search/suggest/PersistenceTest.java index a2deec4d6c9..73f5ae82dad 100644 --- 
a/modules/suggest/src/test/org/apache/lucene/search/suggest/PersistenceTest.java +++ b/modules/suggest/src/test/org/apache/lucene/search/suggest/PersistenceTest.java @@ -17,8 +17,10 @@ package org.apache.lucene.search.suggest; import java.io.File; +import java.util.List; import org.apache.lucene.search.suggest.Lookup; +import org.apache.lucene.search.suggest.Lookup.LookupResult; import org.apache.lucene.search.suggest.fst.FSTCompletionLookup; import org.apache.lucene.search.suggest.jaspell.JaspellLookup; import org.apache.lucene.search.suggest.tst.TSTLookup; @@ -74,16 +76,18 @@ public class PersistenceTest extends LuceneTestCase { lookup.load(storeDir); // Assert validity. - float previous = Float.NEGATIVE_INFINITY; + long previous = Long.MIN_VALUE; for (TermFreq k : keys) { - Float val = (Float) lookup.get(_TestUtil.bytesToCharSequence(k.term, random)); - assertNotNull(k.term.utf8ToString(), val); + List list = lookup.lookup(_TestUtil.bytesToCharSequence(k.term, random), false, 1); + assertEquals(1, list.size()); + LookupResult lookupResult = list.get(0); + assertNotNull(k.term.utf8ToString(), lookupResult.key); if (supportsExactWeights) { - assertEquals(k.term.utf8ToString(), Float.valueOf(k.v), val); + assertEquals(k.term.utf8ToString(), k.v, lookupResult.value); } else { - assertTrue(val + ">=" + previous, val >= previous); - previous = val.floatValue(); + assertTrue(lookupResult.value + ">=" + previous, lookupResult.value >= previous); + previous = lookupResult.value; } } } diff --git a/modules/suggest/src/test/org/apache/lucene/search/suggest/TestBytesRefList.java b/modules/suggest/src/test/org/apache/lucene/search/suggest/TestBytesRefList.java index 81952818ce2..ca997fabc28 100644 --- a/modules/suggest/src/test/org/apache/lucene/search/suggest/TestBytesRefList.java +++ b/modules/suggest/src/test/org/apache/lucene/search/suggest/TestBytesRefList.java @@ -29,59 +29,79 @@ import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; 
public class TestBytesRefList extends LuceneTestCase { - + public void testAppend() throws IOException { BytesRefList list = new BytesRefList(); List stringList = new ArrayList(); - int entries = atLeast(500); - BytesRef spare = new BytesRef(); - for (int i = 0; i < entries; i++) { - String randomRealisticUnicodeString = _TestUtil - .randomRealisticUnicodeString(random); - spare.copyChars(randomRealisticUnicodeString); - list.append(spare); - stringList.add(randomRealisticUnicodeString); - } - for (int i = 0; i < entries; i++) { - assertNotNull(list.get(spare, i)); - assertEquals("entry " + i + " doesn't match", stringList.get(i), - spare.utf8ToString()); - } - - // check random - for (int i = 0; i < entries; i++) { - int e = random.nextInt(entries); - assertNotNull(list.get(spare, e)); - assertEquals("entry " + i + " doesn't match", stringList.get(e), - spare.utf8ToString()); - } - for (int i = 0; i < 2; i++) { - - BytesRefIterator iterator = list.iterator(); - for (String string : stringList) { - assertEquals(string, iterator.next().utf8ToString()); + for (int j = 0; j < 2; j++) { + if (j > 0 && random.nextBoolean()) { + list.clear(); + stringList.clear(); + } + int entries = atLeast(500); + BytesRef spare = new BytesRef(); + for (int i = 0; i < entries; i++) { + String randomRealisticUnicodeString = _TestUtil + .randomRealisticUnicodeString(random); + spare.copyChars(randomRealisticUnicodeString); + list.append(spare); + stringList.add(randomRealisticUnicodeString); + } + for (int i = 0; i < entries; i++) { + assertNotNull(list.get(spare, i)); + assertEquals("entry " + i + " doesn't match", stringList.get(i), + spare.utf8ToString()); + } + + // check random + for (int i = 0; i < entries; i++) { + int e = random.nextInt(entries); + assertNotNull(list.get(spare, e)); + assertEquals("entry " + i + " doesn't match", stringList.get(e), + spare.utf8ToString()); + } + for (int i = 0; i < 2; i++) { + + BytesRefIterator iterator = list.iterator(); + for (String string : 
stringList) { + assertEquals(string, iterator.next().utf8ToString()); + } } } } - - public void testSort() { + + public void testSort() throws IOException { BytesRefList list = new BytesRefList(); List stringList = new ArrayList(); - int entries = atLeast(500); - BytesRef spare = new BytesRef(); - for (int i = 0; i < entries; i++) { - String randomRealisticUnicodeString = _TestUtil.randomRealisticUnicodeString(random); - spare.copyChars(randomRealisticUnicodeString); - list.append(spare); - stringList.add(randomRealisticUnicodeString); - } - Collections.sort(stringList); - int[] sortedOrds = list.sort(BytesRef.getUTF8SortedAsUTF16Comparator()); - for (int i = 0; i < entries; i++) { - assertNotNull(list.get(spare, sortedOrds[i])); - assertEquals("entry " + i + " doesn't match", stringList.get(i), - spare.utf8ToString()); + + for (int j = 0; j < 2; j++) { + if (j > 0 && random.nextBoolean()) { + list.clear(); + stringList.clear(); + } + int entries = atLeast(500); + BytesRef spare = new BytesRef(); + for (int i = 0; i < entries; i++) { + String randomRealisticUnicodeString = _TestUtil + .randomRealisticUnicodeString(random); + spare.copyChars(randomRealisticUnicodeString); + list.append(spare); + stringList.add(randomRealisticUnicodeString); + } + + Collections.sort(stringList); + BytesRefIterator iter = list.iterator(BytesRef + .getUTF8SortedAsUTF16Comparator()); + int i = 0; + while ((spare = iter.next()) != null) { + assertEquals("entry " + i + " doesn't match", stringList.get(i), + spare.utf8ToString()); + i++; + } + assertNull(iter.next()); + assertEquals(i, stringList.size()); } } + } diff --git a/modules/suggest/src/test/org/apache/lucene/search/suggest/TestTermFreqIterator.java b/modules/suggest/src/test/org/apache/lucene/search/suggest/TestTermFreqIterator.java index 6e74bc20ec9..5638894b83d 100644 --- a/modules/suggest/src/test/org/apache/lucene/search/suggest/TestTermFreqIterator.java +++ 
b/modules/suggest/src/test/org/apache/lucene/search/suggest/TestTermFreqIterator.java @@ -17,12 +17,16 @@ package org.apache.lucene.search.suggest; * the License. */ +import java.util.Comparator; import java.util.Iterator; import java.util.Map; import java.util.TreeMap; import org.apache.lucene.search.spell.TermFreqIterator; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefHash; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; @@ -38,7 +42,8 @@ public class TestTermFreqIterator extends LuceneTestCase { public void testTerms() throws Exception { int num = atLeast(10000); - TreeMap sorted = new TreeMap(); + Comparator comparator = random.nextBoolean() ? BytesRef.getUTF8SortedAsUnicodeComparator() : BytesRef.getUTF8SortedAsUTF16Comparator(); + TreeMap sorted = new TreeMap(comparator); TermFreq[] unsorted = new TermFreq[num]; for (int i = 0; i < num; i++) { @@ -52,13 +57,13 @@ public class TestTermFreqIterator extends LuceneTestCase { } // test the sorted iterator wrapper - TermFreqIterator wrapper = new SortedTermFreqIteratorWrapper(new TermFreqArrayIterator(unsorted), BytesRef.getUTF8SortedAsUnicodeComparator()); + TermFreqIterator wrapper = new SortedTermFreqIteratorWrapper(new TermFreqArrayIterator(unsorted), comparator); Iterator> expected = sorted.entrySet().iterator(); while (expected.hasNext()) { Map.Entry entry = expected.next(); assertEquals(entry.getKey(), wrapper.next()); - assertEquals(entry.getValue().longValue(), wrapper.weight(), 0F); + assertEquals(entry.getValue().longValue(), wrapper.weight()); } assertNull(wrapper.next()); @@ -72,4 +77,57 @@ public class TestTermFreqIterator extends LuceneTestCase { } assertEquals(sorted, actual); } + + + public void testRaw() throws Exception { + int num = atLeast(10000); + + Comparator comparator = BytesRef.getUTF8SortedAsUnicodeComparator(); + 
BytesRefHash sorted = new BytesRefHash(); + TermFreq[] unsorted = new TermFreq[num]; + byte[] buffer = new byte[0]; + ByteArrayDataOutput output = new ByteArrayDataOutput(buffer); + + for (int i = 0; i < num; i++) { + BytesRef spare; + long weight; + do { + spare = new BytesRef(_TestUtil.randomUnicodeString(random)); + if (spare.length + 8 >= buffer.length) { + buffer = ArrayUtil.grow(buffer, spare.length + 8); + } + output.reset(buffer); + output.writeBytes(spare.bytes, spare.offset, spare.length); + weight = random.nextLong(); + output.writeLong(weight); + + } while (sorted.add(new BytesRef(buffer, 0, output.getPosition())) < 0); + unsorted[i] = new TermFreq(spare, weight); + } + + // test the sorted iterator wrapper + TermFreqIterator wrapper = new SortedTermFreqIteratorWrapper(new TermFreqArrayIterator(unsorted), comparator, true); + int[] sort = sorted.sort(comparator); + int size = sorted.size(); + BytesRef spare = new BytesRef(); + for (int i = 0; i < size; i++) { + sorted.get(sort[i], spare); + spare.length -= 8; // sub the long value + assertEquals(spare, wrapper.next()); + spare.offset = spare.offset + spare.length; + spare.length = 8; + assertEquals(asLong(spare), wrapper.weight()); + } + assertNull(wrapper.next()); + } + + public static long asLong(BytesRef b) { + return (((long) asIntInternal(b, b.offset) << 32) | asIntInternal(b, + b.offset + 4) & 0xFFFFFFFFL); + } + + private static int asIntInternal(BytesRef b, int pos) { + return ((b.bytes[pos++] & 0xFF) << 24) | ((b.bytes[pos++] & 0xFF) << 16) + | ((b.bytes[pos++] & 0xFF) << 8) | (b.bytes[pos] & 0xFF); + } } diff --git a/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/BytesRefSortersTest.java b/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/BytesRefSortersTest.java index cb62b2ae301..5c06670a3b2 100644 --- a/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/BytesRefSortersTest.java +++ 
b/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/BytesRefSortersTest.java @@ -17,9 +17,8 @@ package org.apache.lucene.search.suggest.fst; * limitations under the License. */ -import java.util.Iterator; - import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefIterator; import org.apache.lucene.util.LuceneTestCase; import org.junit.Test; @@ -31,7 +30,7 @@ public class BytesRefSortersTest extends LuceneTestCase { @Test public void testInMemorySorter() throws Exception { - check(new InMemorySorter()); + check(new InMemorySorter(BytesRef.getUTF8SortedAsUnicodeComparator())); } private void check(BytesRefSorter sorter) throws Exception { @@ -42,8 +41,8 @@ public class BytesRefSortersTest extends LuceneTestCase { } // Create two iterators and check that they're aligned with each other. - Iterator i1 = sorter.iterator(); - Iterator i2 = sorter.iterator(); + BytesRefIterator i1 = sorter.iterator(); + BytesRefIterator i2 = sorter.iterator(); // Verify sorter contract. try { @@ -52,10 +51,12 @@ public class BytesRefSortersTest extends LuceneTestCase { } catch (IllegalStateException e) { // Expected. 
} - - while (i1.hasNext() && i2.hasNext()) { - assertEquals(i1.next(), i2.next()); + BytesRef spare1; + BytesRef spare2; + while ((spare1 = i1.next()) != null && (spare2 = i2.next()) != null) { + assertEquals(spare1, spare2); } - assertEquals(i1.hasNext(), i2.hasNext()); + assertNull(i1.next()); + assertNull(i2.next()); } } diff --git a/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FSTCompletionTest.java b/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FSTCompletionTest.java index f97d6b5c1dc..339282e642b 100644 --- a/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FSTCompletionTest.java +++ b/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FSTCompletionTest.java @@ -165,9 +165,9 @@ public class FSTCompletionTest extends LuceneTestCase { // All the weights were constant, so all returned buckets must be constant, whatever they // are. - Float previous = null; + Long previous = null; for (TermFreq tf : keys) { - Float current = (Float)lookup.get(_TestUtil.bytesToCharSequence(tf.term, random)); + Long current = ((Number)lookup.get(_TestUtil.bytesToCharSequence(tf.term, random))).longValue(); if (previous != null) { assertEquals(previous, current); } @@ -181,7 +181,7 @@ public class FSTCompletionTest extends LuceneTestCase { FSTCompletionLookup lookup = new FSTCompletionLookup(); lookup.build(new TermFreqArrayIterator(input)); for (TermFreq tf : input) { - assertTrue("Not found: " + tf.term.toString(), lookup.get(_TestUtil.bytesToCharSequence(tf.term, random)) != null); + assertNotNull("Not found: " + tf.term.toString(), lookup.get(_TestUtil.bytesToCharSequence(tf.term, random))); assertEquals(tf.term.utf8ToString(), lookup.lookup(_TestUtil.bytesToCharSequence(tf.term, random), true, 1).get(0).key.toString()); } diff --git a/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FloatMagicTest.java b/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FloatMagicTest.java deleted file mode 
100644 index 2129142aabd..00000000000 --- a/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FloatMagicTest.java +++ /dev/null @@ -1,140 +0,0 @@ -package org.apache.lucene.search.suggest.fst; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.util.*; - -import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util.NumericUtils; -import org.junit.Ignore; -import org.junit.Test; - -public class FloatMagicTest extends LuceneTestCase { - public void testFloatMagic() { - ArrayList floats = new ArrayList(Arrays.asList( - Float.intBitsToFloat(0x7f800001), // NaN (invalid combination). - Float.intBitsToFloat(0x7fffffff), // NaN (invalid combination). - Float.intBitsToFloat(0xff800001), // NaN (invalid combination). - Float.intBitsToFloat(0xffffffff), // NaN (invalid combination). - Float.POSITIVE_INFINITY, - Float.MAX_VALUE, - 100f, - 0f, - 0.1f, - Float.MIN_VALUE, - Float.NaN, - -0.0f, - -Float.MIN_VALUE, - -0.1f, - -1f, - -10f, - Float.NEGATIVE_INFINITY)); - - // Sort them using juc. - Collections.sort(floats); - - // Convert to sortable int4 representation (as long to have an unsigned sort). 
- long [] int4 = new long [floats.size()]; - for (int i = 0; i < floats.size(); i++) { - int4[i] = FloatMagic.toSortable(floats.get(i)) & 0xffffffffL; - - /* - System.out.println( - String.format("raw %8s sortable %8s %8s numutils %8s %s", - Integer.toHexString(Float.floatToRawIntBits(floats.get(i))), - Integer.toHexString(FloatMagic.toSortable(floats.get(i))), - Integer.toHexString(FloatMagic.unsignedOrderedToFloatBits(FloatMagic.toSortable(floats.get(i)))), - Integer.toHexString(NumericUtils.floatToSortableInt(floats.get(i))), - floats.get(i))); - */ - } - - // Sort and compare. Should be identical order. - Arrays.sort(int4); - ArrayList backFromFixed = new ArrayList(); - for (int i = 0; i < int4.length; i++) { - backFromFixed.add(FloatMagic.fromSortable((int) int4[i])); - } - - /* - for (int i = 0; i < int4.length; i++) { - System.out.println( - floats.get(i) + " " + FloatMagic.fromSortable((int) int4[i])); - } - */ - - assertEquals(floats, backFromFixed); - } - - @Ignore("Once checked, valid forever?") @Test - public void testRoundTripFullRange() { - int i = 0; - do { - float f = Float.intBitsToFloat(i); - float f2 = FloatMagic.fromSortable(FloatMagic.toSortable(f)); - - if (!((Float.isNaN(f) && Float.isNaN(f2)) || f == f2)) { - throw new RuntimeException("! " + Integer.toHexString(i) + "> " + f + " " + f2); - } - - if ((i & 0xffffff) == 0) { - System.out.println(Integer.toHexString(i)); - } - - i++; - } while (i != 0); - } - - @Ignore("Once checked, valid forever?") @Test - public void testIncreasingFullRange() { - // -infinity ... -0.0 - for (int i = 0xff800000; i != 0x80000000; i--) { - checkSmaller(i, i - 1); - } - - // -0.0 +0.0 - checkSmaller(0x80000000, 0); - - // +0.0 ... +infinity - for (int i = 0; i != 0x7f800000; i++) { - checkSmaller(i, i + 1); - } - - // All other are NaNs and should be after positive infinity. 
- final long infinity = toSortableL(Float.POSITIVE_INFINITY); - for (int i = 0x7f800001; i != 0x7fffffff; i++) { - assertTrue(infinity < toSortableL(Float.intBitsToFloat(i))); - } - for (int i = 0xff800001; i != 0xffffffff; i++) { - assertTrue(infinity < toSortableL(Float.intBitsToFloat(i))); - } - } - - private long toSortableL(float f) { - return FloatMagic.toSortable(f) & 0xffffffffL; - } - - private void checkSmaller(int i1, int i2) { - float f1 = Float.intBitsToFloat(i1); - float f2 = Float.intBitsToFloat(i2); - if (f1 > f2) { - throw new AssertionError(f1 + " " + f2 + " " + i1 + " " + i2); - } - assertTrue(toSortableL(f1) < toSortableL(f2)); - } -} diff --git a/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/TestSort.java b/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/TestSort.java index f4f985328ca..3a7937c8ac9 100644 --- a/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/TestSort.java +++ b/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/TestSort.java @@ -20,6 +20,7 @@ package org.apache.lucene.search.suggest.fst; import java.io.*; import java.util.ArrayList; import java.util.Arrays; +import java.util.Comparator; import org.apache.lucene.search.suggest.fst.Sort.BufferSize; import org.apache.lucene.search.suggest.fst.Sort.ByteSequencesWriter; @@ -61,7 +62,7 @@ public class TestSort extends LuceneTestCase { @Test public void testIntermediateMerges() throws Exception { // Sort 20 mb worth of data with 1mb buffer, binary merging. - SortInfo info = checkSort(new Sort(BufferSize.megabytes(1), Sort.defaultTempDir(), 2), + SortInfo info = checkSort(new Sort(Sort.DEFAULT_COMPARATOR, BufferSize.megabytes(1), Sort.defaultTempDir(), 2), generateRandom(Sort.MB * 20)); assertTrue(info.mergeRounds > 10); } @@ -69,7 +70,7 @@ public class TestSort extends LuceneTestCase { @Test public void testSmallRandom() throws Exception { // Sort 20 mb worth of data with 1mb buffer. 
- SortInfo sortInfo = checkSort(new Sort(BufferSize.megabytes(1), Sort.defaultTempDir(), Sort.MAX_TEMPFILES), + SortInfo sortInfo = checkSort(new Sort(Sort.DEFAULT_COMPARATOR, BufferSize.megabytes(1), Sort.defaultTempDir(), Sort.MAX_TEMPFILES), generateRandom(Sort.MB * 20)); assertEquals(1, sortInfo.mergeRounds); } @@ -77,7 +78,7 @@ public class TestSort extends LuceneTestCase { @Test @Nightly public void testLargerRandom() throws Exception { // Sort 100MB worth of data with 15mb buffer. - checkSort(new Sort(BufferSize.megabytes(16), Sort.defaultTempDir(), Sort.MAX_TEMPFILES), + checkSort(new Sort(Sort.DEFAULT_COMPARATOR, BufferSize.megabytes(16), Sort.defaultTempDir(), Sort.MAX_TEMPFILES), generateRandom(Sort.MB * 100)); } @@ -92,14 +93,25 @@ public class TestSort extends LuceneTestCase { byte [][] bytes = data.toArray(new byte[data.size()][]); return bytes; } - + + static final Comparator unsignedByteOrderComparator = new Comparator() { + public int compare(byte[] left, byte[] right) { + final int max = Math.min(left.length, right.length); + for (int i = 0, j = 0; i < max; i++, j++) { + int diff = (left[i] & 0xff) - (right[j] & 0xff); + if (diff != 0) + return diff; + } + return left.length - right.length; + } + }; /** * Check sorting data on an instance of {@link Sort}. 
*/ private SortInfo checkSort(Sort sort, byte[][] data) throws IOException { File unsorted = writeAll("unsorted", data); - Arrays.sort(data, Sort.unsignedByteOrderComparator); + Arrays.sort(data, unsignedByteOrderComparator); File golden = writeAll("golden", data); File sorted = new File(tempDir, "sorted"); diff --git a/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/WFSTCompletionTest.java b/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/WFSTCompletionTest.java index 916eeb91557..6cadef3c379 100644 --- a/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/WFSTCompletionTest.java +++ b/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/WFSTCompletionTest.java @@ -117,7 +117,7 @@ public class WFSTCompletionTest extends LuceneTestCase { // TODO: could be faster... but its slowCompletor for a reason for (Map.Entry e : slowCompletor.entrySet()) { if (e.getKey().startsWith(prefix)) { - matches.add(new LookupResult(e.getKey(), (float)e.getValue().longValue())); + matches.add(new LookupResult(e.getKey(), e.getValue().longValue())); } } diff --git a/solr/core/src/java/org/apache/solr/spelling/suggest/Suggester.java b/solr/core/src/java/org/apache/solr/spelling/suggest/Suggester.java index 8f0cade9604..525ce3b97dc 100644 --- a/solr/core/src/java/org/apache/solr/spelling/suggest/Suggester.java +++ b/solr/core/src/java/org/apache/solr/spelling/suggest/Suggester.java @@ -153,11 +153,6 @@ public class Suggester extends SolrSpellChecker { build(core, searcher); } - public void add(CharsRef query, int numHits) { - LOG.info("add " + query + ", " + numHits); - lookup.add(query, new Integer(numHits)); - } - static SpellingResult EMPTY_RESULT = new SpellingResult(); @Override @@ -182,7 +177,7 @@ public class Suggester extends SolrSpellChecker { Collections.sort(suggestions); } for (LookupResult lr : suggestions) { - res.add(t, lr.key.toString(), ((Number)lr.value).intValue()); + res.add(t, lr.key.toString(), (int)lr.value); } } return 
res;