LUCENE-3807: clean up TermFreqIterator API

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1291418 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Simon Willnauer 2012-02-20 19:35:59 +00:00
parent 630addb415
commit 1860439f15
27 changed files with 753 additions and 390 deletions

View File

@ -23,6 +23,7 @@ import java.util.Comparator;
import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Bits; import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
/** Iterator to seek ({@link #seekCeil(BytesRef)}, {@link /** Iterator to seek ({@link #seekCeil(BytesRef)}, {@link
* #seekExact(BytesRef,boolean)}) or step through ({@link * #seekExact(BytesRef,boolean)}) or step through ({@link
@ -40,7 +41,7 @@ import org.apache.lucene.util.BytesRef;
* of the <code>seek</code> methods. * of the <code>seek</code> methods.
* *
* @lucene.experimental */ * @lucene.experimental */
public abstract class TermsEnum { public abstract class TermsEnum implements BytesRefIterator {
private AttributeSource atts = null; private AttributeSource atts = null;
@ -114,14 +115,6 @@ public abstract class TermsEnum {
} }
} }
/** Increments the enumeration to the next term.
* Returns the resulting term, or null if the end was
* hit (which means the enum is unpositioned). The
* returned BytesRef may be re-used across calls to next.
* After this method returns null, do not call it again:
* the results are undefined. */
public abstract BytesRef next() throws IOException;
/** Returns current term. Do not call this when the enum /** Returns current term. Do not call this when the enum
* is unpositioned. */ * is unpositioned. */
public abstract BytesRef term() throws IOException; public abstract BytesRef term() throws IOException;

View File

@ -280,6 +280,37 @@ public final class ByteBlockPool {
} while(true); } while(true);
} }
/**
 * Copies a byte sequence out of this pool into the given {@link BytesRef}.
 * <p>
 * On entry, <code>bytes.offset</code> is read as an absolute offset into the
 * pool (block index in the high bits, position within the block in the low
 * bits) and <code>bytes.length</code> as the number of bytes to copy. On
 * return, <code>bytes.bytes</code> holds the copied bytes starting at index 0,
 * <code>bytes.offset</code> is 0 and <code>bytes.length</code> is unchanged.
 * <p>
 * The sequence may span any number of consecutive blocks; each block is
 * visited in turn.
 *
 * @param bytes in: pool offset and length of the sequence to read;
 *              out: the copied bytes
 * @return the given <code>bytes</code> instance, for call chaining
 */
public final BytesRef copyFrom(final BytesRef bytes) {
  final int length = bytes.length;
  final int offset = bytes.offset;
  // The destination is always filled from index 0; make sure it is large enough.
  bytes.offset = 0;
  bytes.grow(length);
  int bufferIndex = offset >> BYTE_BLOCK_SHIFT;
  int pos = offset & BYTE_BLOCK_MASK;
  int copied = 0;
  while (copied < length) {
    // Copy what is left of the current block, or the remaining bytes, whichever is smaller.
    // NOTE(review): assumes BYTE_BLOCK_SIZE == 1 << BYTE_BLOCK_SHIFT, as the mask/shift usage implies.
    final int chunk = Math.min(length - copied, BYTE_BLOCK_SIZE - pos);
    System.arraycopy(buffers[bufferIndex], pos, bytes.bytes, copied, chunk);
    copied += chunk;
    // Continue at the start of the next block. The previous code re-read the
    // same block here, which corrupted values crossing a block boundary.
    bufferIndex++;
    pos = 0;
  }
  bytes.length = length;
  bytes.offset = 0;
  return bytes;
}
/** /**
* Writes the pools content to the given {@link DataOutput} * Writes the pools content to the given {@link DataOutput}
*/ */

View File

@ -0,0 +1,52 @@
package org.apache.lucene.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
/**
 * Minimal forward-only iteration contract over a sequence of
 * {@link BytesRef} values.
 */
public interface BytesRefIterator {

  /**
   * Advances to the next {@link BytesRef} and returns it.
   * <p>
   * Implementations are allowed to hand back the same {@link BytesRef}
   * instance on every call, so callers that need to keep a value must copy
   * it. Once <code>null</code> has been returned the iterator is exhausted
   * and must not be advanced again: the results are undefined.
   *
   * @return the next {@link BytesRef}, or <code>null</code> when the end of
   *         the iteration has been reached
   * @throws IOException if the underlying source fails while advancing
   */
  BytesRef next() throws IOException;

  /** Shared instance of an iterator that yields no elements. */
  BytesRefIterator EMPTY_ITERATOR = new EmptyBytesRefIterator();

  /** An iterator over nothing: {@link #next()} always returns <code>null</code>. */
  final class EmptyBytesRefIterator implements BytesRefIterator {
    @Override
    public BytesRef next() throws IOException {
      return null;
    }
  }
}

View File

@ -16,7 +16,7 @@ package org.apache.lucene.search.spell;
* limitations under the License. * limitations under the License.
*/ */
import java.util.Iterator; import org.apache.lucene.util.BytesRefIterator;
/** /**
* A simple interface representing a Dictionary. A Dictionary * A simple interface representing a Dictionary. A Dictionary
@ -30,5 +30,5 @@ public interface Dictionary {
* Return all words present in the dictionary * Return all words present in the dictionary
* @return Iterator * @return Iterator
*/ */
Iterator<String> getWordsIterator(); BytesRefIterator getWordsIterator();
} }

View File

@ -18,12 +18,14 @@
package org.apache.lucene.search.spell; package org.apache.lucene.search.spell;
import java.io.IOException; import java.io.IOException;
import java.util.Comparator;
import java.util.Iterator; import java.util.Iterator;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.Terms; import org.apache.lucene.index.Terms;
import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.MultiFields;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.UnicodeUtil;
@ -50,14 +52,13 @@ public class HighFrequencyDictionary implements Dictionary {
this.thresh = thresh; this.thresh = thresh;
} }
public final Iterator<String> getWordsIterator() { public final BytesRefIterator getWordsIterator() {
return new HighFrequencyIterator(); return new HighFrequencyIterator();
} }
final class HighFrequencyIterator implements TermFreqIterator, SortedIterator { final class HighFrequencyIterator implements TermFreqIterator, SortedIterator {
private TermsEnum termsEnum; private final BytesRef spare = new BytesRef();
private BytesRef actualTerm; private final TermsEnum termsEnum;
private boolean hasNextCalled;
private int minNumDocs; private int minNumDocs;
HighFrequencyIterator() { HighFrequencyIterator() {
@ -65,6 +66,8 @@ public class HighFrequencyDictionary implements Dictionary {
Terms terms = MultiFields.getTerms(reader, field); Terms terms = MultiFields.getTerms(reader, field);
if (terms != null) { if (terms != null) {
termsEnum = terms.iterator(null); termsEnum = terms.iterator(null);
} else {
termsEnum = null;
} }
minNumDocs = (int)(thresh * (float)reader.numDocs()); minNumDocs = (int)(thresh * (float)reader.numDocs());
} catch (IOException e) { } catch (IOException e) {
@ -83,57 +86,27 @@ public class HighFrequencyDictionary implements Dictionary {
throw new RuntimeException(ioe); throw new RuntimeException(ioe);
} }
} }
public String next() {
if (!hasNextCalled && !hasNext()) {
return null;
}
hasNextCalled = false;
if (actualTerm == null) {
return null; @Override
} else { public BytesRef next() throws IOException {
UnicodeUtil.UTF8toUTF16(actualTerm, spare); if (termsEnum != null) {
return spare.toString(); BytesRef next = termsEnum.next();
if (next != null && isFrequent(termsEnum.docFreq())) {
spare.copyBytes(next);
return spare;
}
} }
return null;
} }
public boolean hasNext() { @Override
if (hasNextCalled) { public Comparator<BytesRef> comparator() {
return actualTerm != null; try {
return termsEnum.getComparator();
} catch (IOException e) {
throw new RuntimeException(e);
} }
hasNextCalled = true;
if (termsEnum == null) {
return false;
}
while(true) {
try {
actualTerm = termsEnum.next();
} catch (IOException e) {
throw new RuntimeException(e);
}
// if there are no words return false
if (actualTerm == null) {
return false;
}
// got a valid term, does it pass the threshold?
try {
if (isFrequent(termsEnum.docFreq())) {
return true;
}
} catch (IOException ioe) {
throw new RuntimeException(ioe);
}
}
}
public void remove() {
throw new UnsupportedOperationException();
} }
} }
} }

View File

@ -18,13 +18,7 @@ package org.apache.lucene.search.spell;
*/ */
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.BytesRefIterator;
import java.util.Iterator;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.index.Terms; import org.apache.lucene.index.Terms;
import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.MultiFields;
@ -49,50 +43,18 @@ public class LuceneDictionary implements Dictionary {
this.field = field; this.field = field;
} }
public final Iterator<String> getWordsIterator() { public final BytesRefIterator getWordsIterator() {
return new LuceneIterator();
} try {
final Terms terms = MultiFields.getTerms(reader, field);
if (terms != null) {
final class LuceneIterator implements Iterator<String> { return terms.iterator(null);
private TermsEnum termsEnum; } else {
private BytesRef pendingTerm; return BytesRefIterator.EMPTY_ITERATOR;
private final CharsRef spare = new CharsRef();
LuceneIterator() {
try {
final Terms terms = MultiFields.getTerms(reader, field);
if (terms != null) {
termsEnum = terms.iterator(null);
pendingTerm = termsEnum.next();
}
} catch (IOException e) {
throw new RuntimeException(e);
} }
} } catch (IOException e) {
throw new RuntimeException(e);
public String next() {
if (pendingTerm == null) {
return null;
}
UnicodeUtil.UTF8toUTF16(pendingTerm, spare);
try {
pendingTerm = termsEnum.next();
} catch (IOException e) {
throw new RuntimeException(e);
}
return spare.toString();
}
public boolean hasNext() {
return pendingTerm != null;
}
public void remove() {
throw new UnsupportedOperationException();
} }
} }
} }

View File

@ -21,6 +21,10 @@ package org.apache.lucene.search.spell;
import java.util.Iterator; import java.util.Iterator;
import java.io.*; import java.io.*;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.IOUtils;
/** /**
* Dictionary represented by a text file. * Dictionary represented by a text file.
@ -33,8 +37,6 @@ import java.io.*;
public class PlainTextDictionary implements Dictionary { public class PlainTextDictionary implements Dictionary {
private BufferedReader in; private BufferedReader in;
private String line;
private boolean hasNextCalled;
public PlainTextDictionary(File file) throws FileNotFoundException { public PlainTextDictionary(File file) throws FileNotFoundException {
in = new BufferedReader(new FileReader(file)); in = new BufferedReader(new FileReader(file));
@ -51,31 +53,37 @@ public class PlainTextDictionary implements Dictionary {
in = new BufferedReader(reader); in = new BufferedReader(reader);
} }
public Iterator<String> getWordsIterator() { public BytesRefIterator getWordsIterator() {
return new fileIterator(); return new FileIterator();
} }
final class fileIterator implements Iterator<String> { final class FileIterator implements BytesRefIterator {
public String next() { private boolean done = false;
if (!hasNextCalled) { private final BytesRef spare = new BytesRef();
hasNext(); @Override
public BytesRef next() throws IOException {
if (done) {
return null;
} }
hasNextCalled = false; boolean success = false;
return line; BytesRef result;
}
public boolean hasNext() {
hasNextCalled = true;
try { try {
line = in.readLine(); String line;
} catch (IOException ex) { if ((line = in.readLine()) != null) {
throw new RuntimeException(ex); spare.copyChars(line);
result = spare;
} else {
done = true;
IOUtils.close(in);
result = null;
}
success = true;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(in);
}
} }
return (line != null) ? true : false; return result;
}
public void remove() {
throw new UnsupportedOperationException();
} }
} }

View File

@ -17,12 +17,17 @@ package org.apache.lucene.search.spell;
* limitations under the License. * limitations under the License.
*/ */
import java.util.Comparator;
import java.util.Iterator; import java.util.Iterator;
import org.apache.lucene.util.BytesRef;
/** /**
* Marker interface to signal that elements coming from {@link Iterator} * Marker interface to signal that elements coming from {@link Iterator}
* come in ascending lexicographic order. * come in ascending lexicographic order.
*/ */
public interface SortedIterator { public interface SortedIterator {
public Comparator<BytesRef> comparator();
} }

View File

@ -46,6 +46,7 @@ import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.ReaderUtil; import org.apache.lucene.util.ReaderUtil;
import org.apache.lucene.util.Version; import org.apache.lucene.util.Version;
@ -510,20 +511,18 @@ public class SpellChecker implements java.io.Closeable {
boolean isEmpty = termsEnums.isEmpty(); boolean isEmpty = termsEnums.isEmpty();
try { try {
Iterator<String> iter = dict.getWordsIterator(); BytesRefIterator iter = dict.getWordsIterator();
BytesRef currentTerm = new BytesRef(); BytesRef currentTerm;
terms: while (iter.hasNext()) { terms: while ((currentTerm = iter.next()) != null) {
String word = iter.next();
String word = currentTerm.utf8ToString();
int len = word.length(); int len = word.length();
if (len < 3) { if (len < 3) {
continue; // too short we bail but "too long" is fine... continue; // too short we bail but "too long" is fine...
} }
if (!isEmpty) { if (!isEmpty) {
// we have a non-empty index, check if the term exists
currentTerm.copyChars(word);
for (TermsEnum te : termsEnums) { for (TermsEnum te : termsEnums) {
if (te.seekExact(currentTerm, false)) { if (te.seekExact(currentTerm, false)) {
continue terms; continue terms;

View File

@ -17,16 +17,18 @@ package org.apache.lucene.search.spell;
* limitations under the License. * limitations under the License.
*/ */
import java.util.Iterator; import java.io.IOException;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
public interface TermFreqIterator extends Iterator<String> { public interface TermFreqIterator extends BytesRefIterator {
public float freq(); public float freq();
public static class TermFreqIteratorWrapper implements TermFreqIterator { public static class TermFreqIteratorWrapper implements TermFreqIterator {
private Iterator<String> wrapped; private BytesRefIterator wrapped;
public TermFreqIteratorWrapper(Iterator<String> wrapped) { public TermFreqIteratorWrapper(BytesRefIterator wrapped) {
this.wrapped = wrapped; this.wrapped = wrapped;
} }
@ -34,17 +36,8 @@ public interface TermFreqIterator extends Iterator<String> {
return 1.0f; return 1.0f;
} }
public boolean hasNext() { public BytesRef next() throws IOException {
return wrapped.hasNext(); return wrapped.next();
} }
public String next() {
return wrapped.next().toString();
}
public void remove() {
throw new UnsupportedOperationException();
}
} }
} }

View File

@ -17,65 +17,46 @@ package org.apache.lucene.search.suggest;
* limitations under the License. * limitations under the License.
*/ */
import java.util.ArrayList; import java.io.IOException;
import java.util.List;
import org.apache.lucene.search.spell.TermFreqIterator; import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
/** /**
* This wrapper buffers incoming elements. * This wrapper buffers incoming elements.
*/ */
public class BufferingTermFreqIteratorWrapper implements TermFreqIterator { public class BufferingTermFreqIteratorWrapper implements TermFreqIterator {
/** Entry in the buffer. */ protected BytesRefList entries = new BytesRefList();
public static final class Entry implements Comparable<Entry> { protected int curPos = -1;
String word; protected float[] freqs = new float[1];
float freq; private final BytesRef spare = new BytesRef();
public BufferingTermFreqIteratorWrapper(TermFreqIterator source) throws IOException {
public Entry(String word, float freq) { BytesRef spare;
this.word = word; int freqIndex = 0;
this.freq = freq; while((spare = source.next()) != null) {
entries.append(spare);
if (freqIndex >= freqs.length) {
freqs = ArrayUtil.grow(freqs, freqs.length+1);
}
freqs[freqIndex++] = source.freq();
} }
public int compareTo(Entry o) {
return word.compareTo(o.word);
}
}
protected ArrayList<Entry> entries = new ArrayList<Entry>();
protected int curPos;
protected Entry curEntry;
public BufferingTermFreqIteratorWrapper(TermFreqIterator source) {
// read all source data into buffer
while (source.hasNext()) {
String w = source.next();
Entry e = new Entry(w, source.freq());
entries.add(e);
}
curPos = 0;
} }
public float freq() { public float freq() {
return curEntry.freq; return freqs[curPos];
} }
public boolean hasNext() { @Override
return curPos < entries.size(); public BytesRef next() throws IOException {
if (++curPos < entries.size()) {
entries.get(spare, curPos);
return spare;
}
return null;
} }
public String next() {
curEntry = entries.get(curPos);
curPos++;
return curEntry.word;
}
public void remove() {
throw new UnsupportedOperationException("remove is not supported");
}
public List<Entry> entries() {
return entries;
}
} }

View File

@ -0,0 +1,125 @@
package org.apache.lucene.search.suggest;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with this
* work for additional information regarding copyright ownership. The ASF
* licenses this file to You under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
import java.io.IOException;
import java.util.Comparator;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.SorterTemplate;
/**
 * An append-only list of {@link BytesRef} values whose bytes are kept in a
 * {@link ByteBlockPool}. Each element is identified by the position at which
 * it was appended; only the start offset of every element is recorded, the
 * length is derived from the following element's start offset.
 */
final class BytesRefList {

  /** Backing storage for the bytes of all elements. */
  private final ByteBlockPool pool;
  /** Start offset of every appended element, indexed by element position. */
  private int[] offsets = new int[1];
  /** Number of elements appended so far. */
  private int currentElement = 0;
  /** Sum of the lengths of all appended elements, i.e. the next start offset. */
  private int currentOffset = 0;

  /** Creates a list backed by a new pool using a {@link ByteBlockPool.DirectAllocator}. */
  public BytesRefList() {
    this(new ByteBlockPool(new ByteBlockPool.DirectAllocator()));
  }

  /**
   * Creates a list backed by the given pool and advances it to its next buffer.
   * NOTE(review): offsets are counted from 0, so this presumably expects a
   * pool that holds no data yet - confirm against callers.
   *
   * @param pool the pool that will hold the bytes of all elements
   */
  public BytesRefList(ByteBlockPool pool) {
    this.pool = pool;
    pool.nextBuffer();
  }

  /**
   * Adds a copy of the given bytes to the end of the list.
   *
   * @param bytes the value to append
   * @return the number of elements in the list after the append
   */
  public int append(BytesRef bytes) {
    final int slot = currentElement;
    if (slot >= offsets.length) {
      offsets = ArrayUtil.grow(offsets, offsets.length + 1);
    }
    pool.copy(bytes);
    offsets[slot] = currentOffset;
    currentElement = slot + 1;
    currentOffset += bytes.length;
    return currentElement;
  }

  /**
   * @return the number of elements appended so far
   */
  public int size() {
    return currentElement;
  }

  /**
   * Loads the element at the given position into the supplied {@link BytesRef}.
   *
   * @param bytes the instance to fill; its previous content is overwritten
   * @param pos position of the element to read
   * @return the given <code>bytes</code> instance
   * @throws IndexOutOfBoundsException if <code>pos</code> is not less than {@link #size()}
   */
  public BytesRef get(BytesRef bytes, int pos) {
    if (pos >= currentElement) {
      throw new IndexOutOfBoundsException("index " + pos
          + " must be less than the size: " + currentElement);
    }
    final int start = offsets[pos];
    // The last element ends where the next append would begin.
    final int end = (pos == currentElement - 1) ? currentOffset : offsets[pos + 1];
    bytes.offset = start;
    bytes.length = end - start;
    pool.copyFrom(bytes);
    return bytes;
  }

  /**
   * Returns an iterator over the elements present at the time of this call,
   * in append order. The iterator re-uses one {@link BytesRef} for all
   * returned values.
   */
  public BytesRefIterator iterator() {
    final int limit = currentElement;
    return new BytesRefIterator() {
      private final BytesRef spare = new BytesRef();
      private int pos = 0;

      @Override
      public BytesRef next() throws IOException {
        if (pos >= limit) {
          return null;
        }
        get(spare, pos++);
        return spare;
      }
    };
  }

  /**
   * Computes the order of the elements under the given comparator without
   * moving any bytes.
   *
   * @param comp the comparator defining the order
   * @return an array of element positions; entry <code>i</code> is the
   *         position of the element that comes <code>i</code>-th in sorted order
   */
  public int[] sort(final Comparator<BytesRef> comp) {
    final int[] ordinals = new int[size()];
    for (int i = 0; i < ordinals.length; i++) {
      ordinals[i] = i;
    }
    final SorterTemplate sorter = new SorterTemplate() {
      // Scratch instances so comparisons do not allocate.
      private final BytesRef pivot = new BytesRef();
      private final BytesRef scratch1 = new BytesRef();
      private final BytesRef scratch2 = new BytesRef();

      @Override
      protected void swap(int i, int j) {
        final int tmp = ordinals[i];
        ordinals[i] = ordinals[j];
        ordinals[j] = tmp;
      }

      @Override
      protected int compare(int i, int j) {
        final int left = ordinals[i];
        final int right = ordinals[j];
        return comp.compare(get(scratch1, left), get(scratch2, right));
      }

      @Override
      protected void setPivot(int i) {
        get(pivot, ordinals[i]);
      }

      @Override
      protected int comparePivot(int j) {
        return comp.compare(pivot, get(scratch2, ordinals[j]));
      }
    };
    sorter.quickSort(0, size() - 1);
    return ordinals;
  }
}

View File

@ -22,6 +22,8 @@ import java.io.*;
import org.apache.lucene.search.spell.Dictionary; import org.apache.lucene.search.spell.Dictionary;
import org.apache.lucene.search.spell.TermFreqIterator; import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
/** /**
@ -36,7 +38,7 @@ public class FileDictionary implements Dictionary {
private BufferedReader in; private BufferedReader in;
private String line; private String line;
private boolean hasNextCalled; private boolean done = false;
public FileDictionary(InputStream dictFile) { public FileDictionary(InputStream dictFile) {
in = new BufferedReader(new InputStreamReader(dictFile)); in = new BufferedReader(new InputStreamReader(dictFile));
@ -50,45 +52,39 @@ public class FileDictionary implements Dictionary {
} }
public TermFreqIterator getWordsIterator() { public TermFreqIterator getWordsIterator() {
return new fileIterator(); return new FileIterator();
} }
final class fileIterator implements TermFreqIterator { final class FileIterator implements TermFreqIterator {
private float curFreq; private float curFreq;
private final BytesRef spare = new BytesRef();
public String next() {
if (!hasNextCalled) {
hasNext();
}
hasNextCalled = false;
return line;
}
public float freq() { public float freq() {
return curFreq; return curFreq;
} }
public boolean hasNext() { @Override
hasNextCalled = true; public BytesRef next() throws IOException {
try { if (done) {
line = in.readLine(); return null;
if (line != null) { }
String[] fields = line.split("\t"); line = in.readLine();
if (fields.length > 1) { if (line != null) {
curFreq = Float.parseFloat(fields[1]); String[] fields = line.split("\t");
line = fields[0]; if (fields.length > 1) {
} else { curFreq = Float.parseFloat(fields[1]);
curFreq = 1; spare.copyChars(fields[0]);
} } else {
} spare.copyChars(line);
} catch (IOException ex) { curFreq = 1;
throw new RuntimeException(ex); }
return spare;
} else {
done = true;
IOUtils.close(in);
return null;
} }
return (line != null) ? true : false;
}
public void remove() {
throw new UnsupportedOperationException();
} }
} }

View File

@ -19,11 +19,13 @@ package org.apache.lucene.search.suggest;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.util.Iterator; import java.io.InputStream;
import java.io.OutputStream;
import java.util.List; import java.util.List;
import org.apache.lucene.search.spell.Dictionary; import org.apache.lucene.search.spell.Dictionary;
import org.apache.lucene.search.spell.TermFreqIterator; import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.PriorityQueue;
public abstract class Lookup { public abstract class Lookup {
@ -77,7 +79,7 @@ public abstract class Lookup {
* {@link UnsortedTermFreqIteratorWrapper} in such case. * {@link UnsortedTermFreqIteratorWrapper} in such case.
*/ */
public void build(Dictionary dict) throws IOException { public void build(Dictionary dict) throws IOException {
Iterator<String> it = dict.getWordsIterator(); BytesRefIterator it = dict.getWordsIterator();
TermFreqIterator tfit; TermFreqIterator tfit;
if (it instanceof TermFreqIterator) { if (it instanceof TermFreqIterator) {
tfit = (TermFreqIterator)it; tfit = (TermFreqIterator)it;
@ -89,6 +91,52 @@ public abstract class Lookup {
public abstract void build(TermFreqIterator tfit) throws IOException; public abstract void build(TermFreqIterator tfit) throws IOException;
/**
* Look up a key and return possible completion for this key.
* @param key lookup key. Depending on the implementation this may be
* a prefix, misspelling, or even infix.
* @param onlyMorePopular return only more popular results
* @param num maximum number of results to return
* @return a list of possible completions, with their relative weight (e.g. popularity)
*/
// TODO: this should be a BytesRef API?
public abstract List<LookupResult> lookup(String key, boolean onlyMorePopular, int num);
/**
* Modify the lookup data by recording additional data. Optional operation.
* @param key new lookup key
* @param value value to associate with this key
* @return true if new key is added, false if it already exists or operation
* is not supported.
*/
// TODO: this should be a BytesRef API?
public abstract boolean add(String key, Object value);
/**
* Get value associated with a specific key.
* @param key lookup key
* @return associated value
*/
// TODO: this should be a BytesRef API?
public abstract Object get(String key);
/**
* Persist the constructed lookup data to a directory. Optional operation.
* @param output {@link OutputStream} to write the data to.
* @return true if successful, false if unsuccessful or not supported.
* @throws IOException when fatal IO error occurs.
*/
public abstract boolean store(OutputStream output) throws IOException;
/**
* Discard current lookup data and load it from a previously saved copy.
* Optional operation.
* @param input the {@link InputStream} to load the lookup data.
* @return true if completed successfully, false if unsuccessful or not supported.
* @throws IOException when fatal IO error occurs.
*/
public abstract boolean load(InputStream input) throws IOException;
/** /**
* Persist the constructed lookup data to a directory. Optional operation. * Persist the constructed lookup data to a directory. Optional operation.
* @param storeDir directory where data can be stored. * @param storeDir directory where data can be stored.
@ -105,30 +153,4 @@ public abstract class Lookup {
* @throws IOException when fatal IO error occurs. * @throws IOException when fatal IO error occurs.
*/ */
public abstract boolean load(File storeDir) throws IOException; public abstract boolean load(File storeDir) throws IOException;
/**
* Look up a key and return possible completion for this key.
* @param key lookup key. Depending on the implementation this may be
* a prefix, misspelling, or even infix.
* @param onlyMorePopular return only more popular results
* @param num maximum number of results to return
* @return a list of possible completions, with their relative weight (e.g. popularity)
*/
public abstract List<LookupResult> lookup(String key, boolean onlyMorePopular, int num);
/**
* Modify the lookup data by recording additional data. Optional operation.
* @param key new lookup key
* @param value value to associate with this key
* @return true if new key is added, false if it already exists or operation
* is not supported.
*/
public abstract boolean add(String key, Object value);
/**
* Get value associated with a specific key.
* @param key lookup key
* @return associated value
*/
public abstract Object get(String key);
} }

View File

@ -17,10 +17,12 @@ package org.apache.lucene.search.suggest;
* limitations under the License. * limitations under the License.
*/ */
import java.util.Collections; import java.io.IOException;
import java.util.Comparator;
import org.apache.lucene.search.spell.SortedIterator; import org.apache.lucene.search.spell.SortedIterator;
import org.apache.lucene.search.spell.TermFreqIterator; import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.util.BytesRef;
/** /**
* This wrapper buffers incoming elements and makes sure they are sorted in * This wrapper buffers incoming elements and makes sure they are sorted in
@ -28,8 +30,35 @@ import org.apache.lucene.search.spell.TermFreqIterator;
*/ */
public class SortedTermFreqIteratorWrapper extends BufferingTermFreqIteratorWrapper implements SortedIterator { public class SortedTermFreqIteratorWrapper extends BufferingTermFreqIteratorWrapper implements SortedIterator {
public SortedTermFreqIteratorWrapper(TermFreqIterator source) { private final int[] sortedOrds;
private int currentOrd = -1;
private final BytesRef spare = new BytesRef();
private final Comparator<BytesRef> comp;
public SortedTermFreqIteratorWrapper(TermFreqIterator source, Comparator<BytesRef> comp) throws IOException {
super(source); super(source);
Collections.sort(entries); this.sortedOrds = entries.sort(comp);
this.comp = comp;
} }
@Override
public float freq() {
return freqs[currentOrd];
}
@Override
public BytesRef next() throws IOException {
if (++curPos < entries.size()) {
return entries.get(spare, (currentOrd = sortedOrds[curPos]));
}
return null;
}
@Override
public Comparator<BytesRef> comparator() {
return comp;
}
} }

View File

@ -17,9 +17,11 @@ package org.apache.lucene.search.suggest;
* limitations under the License. * limitations under the License.
*/ */
import java.util.Collections; import java.io.IOException;
import java.util.Random;
import org.apache.lucene.search.spell.TermFreqIterator; import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.util.BytesRef;
/** /**
* This wrapper buffers the incoming elements and makes sure they are in * This wrapper buffers the incoming elements and makes sure they are in
@ -27,8 +29,34 @@ import org.apache.lucene.search.spell.TermFreqIterator;
*/ */
public class UnsortedTermFreqIteratorWrapper extends BufferingTermFreqIteratorWrapper { public class UnsortedTermFreqIteratorWrapper extends BufferingTermFreqIteratorWrapper {
public UnsortedTermFreqIteratorWrapper(TermFreqIterator source) { private final int[] ords;
private int currentOrd = -1;
private final BytesRef spare = new BytesRef();
public UnsortedTermFreqIteratorWrapper(TermFreqIterator source) throws IOException {
super(source); super(source);
Collections.shuffle(entries); ords = new int[entries.size()];
Random random = new Random();
for (int i = 0; i < ords.length; i++) {
ords[i] = i;
}
for (int i = 0; i < ords.length; i++) {
int randomPosition = random.nextInt(ords.length);
int temp = ords[i];
ords[i] = ords[randomPosition];
ords[randomPosition] = temp;
}
}
@Override
public float freq() {
return freqs[currentOrd];
}
@Override
public BytesRef next() throws IOException {
if (++curPos < entries.size()) {
return entries.get(spare, (currentOrd = ords[curPos]));
}
return null;
} }
} }

View File

@ -19,6 +19,8 @@ package org.apache.lucene.search.suggest.fst;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
@ -29,6 +31,8 @@ import org.apache.lucene.search.suggest.fst.Sort.SortInfo;
import org.apache.lucene.search.suggest.tst.TSTLookup; import org.apache.lucene.search.suggest.tst.TSTLookup;
import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput; import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.util.*; import org.apache.lucene.util.*;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.NoOutputs; import org.apache.lucene.util.fst.NoOutputs;
@ -158,20 +162,17 @@ public class FSTCompletionLookup extends Lookup {
// If negative floats are allowed some trickery needs to be done to find their byte order. // If negative floats are allowed some trickery needs to be done to find their byte order.
boolean success = false; boolean success = false;
try { try {
BytesRef tmp1 = new BytesRef();
byte [] buffer = new byte [0]; byte [] buffer = new byte [0];
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer); ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
while (tfit.hasNext()) { BytesRef spare;
String key = tfit.next(); while ((spare = tfit.next()) != null) {
UnicodeUtil.UTF16toUTF8(key, 0, key.length(), tmp1); if (spare.length + 4 >= buffer.length) {
buffer = ArrayUtil.grow(buffer, spare.length + 4);
if (tmp1.length + 4 >= buffer.length) {
buffer = ArrayUtil.grow(buffer, tmp1.length + 4);
} }
output.reset(buffer); output.reset(buffer);
output.writeInt(FloatMagic.toSortable(tfit.freq())); output.writeInt(FloatMagic.toSortable(tfit.freq()));
output.writeBytes(tmp1.bytes, tmp1.offset, tmp1.length); output.writeBytes(spare.bytes, spare.offset, spare.length);
writer.write(buffer, 0, output.getPosition()); writer.write(buffer, 0, output.getPosition());
} }
writer.close(); writer.close();
@ -189,6 +190,7 @@ public class FSTCompletionLookup extends Lookup {
int previousBucket = 0; int previousBucket = 0;
float previousScore = 0; float previousScore = 0;
ByteArrayDataInput input = new ByteArrayDataInput(); ByteArrayDataInput input = new ByteArrayDataInput();
BytesRef tmp1 = new BytesRef();
BytesRef tmp2 = new BytesRef(); BytesRef tmp2 = new BytesRef();
while (reader.read(tmp1)) { while (reader.read(tmp1)) {
input.reset(tmp1.bytes); input.reset(tmp1.bytes);
@ -293,4 +295,30 @@ public class FSTCompletionLookup extends Lookup {
normalCompletion.getFST().save(new File(storeDir, FILENAME)); normalCompletion.getFST().save(new File(storeDir, FILENAME));
return true; return true;
} }
/**
 * Serializes the automaton behind {@code normalCompletion} to the given stream.
 * The stream is closed before this method returns on every path, including
 * the case where there is nothing to store.
 *
 * @param output destination stream; always closed by this method
 * @return {@code false} if {@code normalCompletion} is {@code null},
 *         {@code true} once the automaton has been written
 * @throws IOException if writing or closing the stream fails
 */
@Override
public synchronized boolean store(OutputStream output) throws IOException {
  try {
    if (this.normalCompletion == null) {
      // Nothing to write; the finally block still releases the caller's stream.
      return false;
    }
    normalCompletion.getFST().save(new OutputStreamDataOutput(output));
    return true;
  } finally {
    IOUtils.close(output);
  }
}
/**
 * Reads an automaton from the given stream and rebuilds both completion
 * views from it. The stream is closed before returning, whether or not
 * reading succeeded.
 *
 * @param input source stream; always closed by this method
 * @return {@code true} when the automaton was read
 * @throws IOException if reading or closing the stream fails
 */
@Override
public synchronized boolean load(InputStream input) throws IOException {
  try {
    final FST<Object> automaton =
        new FST<Object>(new InputStreamDataInput(input), NoOutputs.getSingleton());
    this.higherWeightsCompletion = new FSTCompletion(automaton);
    // The second view shares the automaton held by the first one.
    this.normalCompletion =
        new FSTCompletion(higherWeightsCompletion.getFST(), false, exactMatchFirst);
    return true;
  } finally {
    IOUtils.close(input);
  }
}
} }

View File

@ -19,6 +19,8 @@ package org.apache.lucene.search.suggest.fst;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.List; import java.util.List;
@ -27,11 +29,12 @@ import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.search.suggest.Lookup; import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput; import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.Arc; import org.apache.lucene.util.fst.FST.Arc;
@ -109,16 +112,14 @@ public class WFSTCompletionLookup extends Lookup {
try { try {
byte [] buffer = new byte [0]; byte [] buffer = new byte [0];
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer); ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
while (iterator.hasNext()) { BytesRef spare;
String key = iterator.next(); while ((spare = iterator.next()) != null) {
UnicodeUtil.UTF16toUTF8(key, 0, key.length(), scratch); if (spare.length + 5 >= buffer.length) {
buffer = ArrayUtil.grow(buffer, spare.length + 5);
if (scratch.length + 5 >= buffer.length) {
buffer = ArrayUtil.grow(buffer, scratch.length + 5);
} }
output.reset(buffer); output.reset(buffer);
output.writeBytes(scratch.bytes, scratch.offset, scratch.length); output.writeBytes(spare.bytes, spare.offset, spare.length);
output.writeByte((byte)0); // separator: not used, just for sort order output.writeByte((byte)0); // separator: not used, just for sort order
output.writeInt((int)encodeWeight(iterator.freq())); output.writeInt((int)encodeWeight(iterator.freq()));
writer.write(buffer, 0, output.getPosition()); writer.write(buffer, 0, output.getPosition());
@ -177,6 +178,26 @@ public class WFSTCompletionLookup extends Lookup {
this.fst = FST.read(new File(storeDir, FILENAME), PositiveIntOutputs.getSingleton(true)); this.fst = FST.read(new File(storeDir, FILENAME), PositiveIntOutputs.getSingleton(true));
return true; return true;
} }
/**
 * Writes {@code fst} to the given stream and closes the stream afterwards,
 * whether or not the write succeeded.
 *
 * @param output destination stream; always closed by this method
 * @return {@code true} when the automaton was written
 * @throws IOException if writing or closing the stream fails
 */
@Override
public boolean store(OutputStream output) throws IOException {
  try {
    final OutputStreamDataOutput sink = new OutputStreamDataOutput(output);
    fst.save(sink);
    return true;
  } finally {
    IOUtils.close(output);
  }
}
/**
 * Replaces {@code fst} with an automaton read from the given stream and
 * closes the stream afterwards, whether or not the read succeeded.
 *
 * @param input source stream; always closed by this method
 * @return {@code true} when the automaton was read
 * @throws IOException if reading or closing the stream fails
 */
@Override
public boolean load(InputStream input) throws IOException {
  try {
    final InputStreamDataInput source = new InputStreamDataInput(input);
    this.fst = new FST<Long>(source, PositiveIntOutputs.getSingleton(true));
    return true;
  } finally {
    IOUtils.close(input);
  }
}
@Override @Override
public List<LookupResult> lookup(String key, boolean onlyMorePopular, int num) { public List<LookupResult> lookup(String key, boolean onlyMorePopular, int num) {

View File

@ -23,6 +23,8 @@ import java.io.File;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.FileOutputStream; import java.io.FileOutputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
@ -31,6 +33,10 @@ import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.search.suggest.Lookup; import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.UnsortedTermFreqIteratorWrapper; import org.apache.lucene.search.suggest.UnsortedTermFreqIteratorWrapper;
import org.apache.lucene.search.suggest.jaspell.JaspellTernarySearchTrie.TSTNode; import org.apache.lucene.search.suggest.jaspell.JaspellTernarySearchTrie.TSTNode;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.UnicodeUtil;
public class JaspellLookup extends Lookup { public class JaspellLookup extends Lookup {
JaspellTernarySearchTrie trie = new JaspellTernarySearchTrie(); JaspellTernarySearchTrie trie = new JaspellTernarySearchTrie();
@ -41,17 +47,22 @@ public class JaspellLookup extends Lookup {
public void build(TermFreqIterator tfit) throws IOException { public void build(TermFreqIterator tfit) throws IOException {
if (tfit instanceof SortedIterator) { if (tfit instanceof SortedIterator) {
// make sure it's unsorted // make sure it's unsorted
// WTF - this could result in yet another sorted iteration....
tfit = new UnsortedTermFreqIteratorWrapper(tfit); tfit = new UnsortedTermFreqIteratorWrapper(tfit);
} }
trie = new JaspellTernarySearchTrie(); trie = new JaspellTernarySearchTrie();
trie.setMatchAlmostDiff(editDistance); trie.setMatchAlmostDiff(editDistance);
while (tfit.hasNext()) { BytesRef spare;
String key = tfit.next(); final CharsRef charsSpare = new CharsRef();
while ((spare = tfit.next()) != null) {
float freq = tfit.freq(); float freq = tfit.freq();
if (key.length() == 0) { if (spare.length == 0) {
continue; continue;
} }
trie.put(key, new Float(freq)); charsSpare.grow(spare.length);
UnicodeUtil.UTF8toUTF16(spare.bytes, spare.offset, spare.length, charsSpare);
trie.put(charsSpare.toString(), new Float(freq));
} }
} }
@ -114,15 +125,7 @@ public class JaspellLookup extends Lookup {
if (!data.exists() || !data.canRead()) { if (!data.exists() || !data.canRead()) {
return false; return false;
} }
DataInputStream in = new DataInputStream(new FileInputStream(data)); return load(new FileInputStream(data));
TSTNode root = trie.new TSTNode('\0', null);
try {
readRecursively(in, root);
trie.setRoot(root);
} finally {
in.close();
}
return true;
} }
private void readRecursively(DataInputStream in, TSTNode node) throws IOException { private void readRecursively(DataInputStream in, TSTNode node) throws IOException {
@ -153,19 +156,8 @@ public class JaspellLookup extends Lookup {
if (!storeDir.exists() || !storeDir.isDirectory() || !storeDir.canWrite()) { if (!storeDir.exists() || !storeDir.isDirectory() || !storeDir.canWrite()) {
return false; return false;
} }
TSTNode root = trie.getRoot();
if (root == null) { // empty tree
return false;
}
File data = new File(storeDir, FILENAME); File data = new File(storeDir, FILENAME);
DataOutputStream out = new DataOutputStream(new FileOutputStream(data)); return store(new FileOutputStream(data));
try {
writeRecursively(out, root);
out.flush();
} finally {
out.close();
}
return true;
} }
private void writeRecursively(DataOutputStream out, TSTNode node) throws IOException { private void writeRecursively(DataOutputStream out, TSTNode node) throws IOException {
@ -186,4 +178,33 @@ public class JaspellLookup extends Lookup {
writeRecursively(out, node.relatives[TSTNode.EQKID]); writeRecursively(out, node.relatives[TSTNode.EQKID]);
writeRecursively(out, node.relatives[TSTNode.HIKID]); writeRecursively(out, node.relatives[TSTNode.HIKID]);
} }
/**
 * Writes the trie to the given stream, starting at its root node.
 * The stream is closed before this method returns on every path, including
 * the empty-tree case, so a stream opened by a caller solely for this call
 * is never leaked.
 *
 * @param output destination stream; always closed by this method
 * @return {@code false} if the trie has no root (empty tree),
 *         {@code true} once the trie has been written
 * @throws IOException if writing or closing the stream fails
 */
@Override
public boolean store(OutputStream output) throws IOException {
  DataOutputStream out = new DataOutputStream(output);
  try {
    TSTNode root = trie.getRoot();
    if (root == null) { // empty tree
      // Nothing to write; the finally block still closes the stream.
      return false;
    }
    writeRecursively(out, root);
    out.flush();
    return true;
  } finally {
    IOUtils.close(out);
  }
}
/**
 * Reads a trie from the given stream into a fresh root node and installs
 * that node as the trie's root once reading succeeds. The stream is closed
 * afterwards, whether or not the read succeeded.
 *
 * @param input source stream; closed by this method
 * @return {@code true} when the trie was read
 * @throws IOException if reading or closing the stream fails
 */
@Override
public boolean load(InputStream input) throws IOException {
  final DataInputStream data = new DataInputStream(input);
  final TSTNode loadedRoot = trie.new TSTNode('\0', null);
  try {
    readRecursively(data, loadedRoot);
    // Only swap the root in after the whole structure was read.
    trie.setRoot(loadedRoot);
    return true;
  } finally {
    IOUtils.close(data);
  }
}
} }

View File

@ -23,6 +23,8 @@ import java.io.File;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.FileOutputStream; import java.io.FileOutputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
@ -30,6 +32,10 @@ import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.SortedTermFreqIteratorWrapper; import org.apache.lucene.search.suggest.SortedTermFreqIteratorWrapper;
import org.apache.lucene.search.spell.SortedIterator; import org.apache.lucene.search.spell.SortedIterator;
import org.apache.lucene.search.spell.TermFreqIterator; import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.UnicodeUtil;
public class TSTLookup extends Lookup { public class TSTLookup extends Lookup {
TernaryTreeNode root = new TernaryTreeNode(); TernaryTreeNode root = new TernaryTreeNode();
@ -39,15 +45,19 @@ public class TSTLookup extends Lookup {
public void build(TermFreqIterator tfit) throws IOException { public void build(TermFreqIterator tfit) throws IOException {
root = new TernaryTreeNode(); root = new TernaryTreeNode();
// buffer first // buffer first
if (!(tfit instanceof SortedIterator)) { if ((!(tfit instanceof SortedIterator)) || ((SortedIterator)tfit).comparator() != BytesRef.getUTF8SortedAsUTF16Comparator()) {
// make sure it's sorted // make sure it's sorted and the comparator uses UTF16 sort order
tfit = new SortedTermFreqIteratorWrapper(tfit); tfit = new SortedTermFreqIteratorWrapper(tfit, BytesRef.getUTF8SortedAsUTF16Comparator());
} }
ArrayList<String> tokens = new ArrayList<String>(); ArrayList<String> tokens = new ArrayList<String>();
ArrayList<Float> vals = new ArrayList<Float>(); ArrayList<Float> vals = new ArrayList<Float>();
while (tfit.hasNext()) { BytesRef spare;
tokens.add(tfit.next()); CharsRef charsSpare = new CharsRef();
while ((spare = tfit.next()) != null) {
charsSpare.grow(spare.length);
UnicodeUtil.UTF8toUTF16(spare.bytes, spare.offset, spare.length, charsSpare);
tokens.add(charsSpare.toString());
vals.add(new Float(tfit.freq())); vals.add(new Float(tfit.freq()));
} }
autocomplete.balancedTree(tokens.toArray(), vals.toArray(), 0, tokens.size() - 1, root); autocomplete.balancedTree(tokens.toArray(), vals.toArray(), 0, tokens.size() - 1, root);
@ -113,14 +123,7 @@ public class TSTLookup extends Lookup {
if (!data.exists() || !data.canRead()) { if (!data.exists() || !data.canRead()) {
return false; return false;
} }
DataInputStream in = new DataInputStream(new FileInputStream(data)); return load(new FileInputStream(data));
root = new TernaryTreeNode();
try {
readRecursively(in, root);
} finally {
in.close();
}
return true;
} }
// pre-order traversal // pre-order traversal
@ -153,14 +156,7 @@ public class TSTLookup extends Lookup {
return false; return false;
} }
File data = new File(storeDir, FILENAME); File data = new File(storeDir, FILENAME);
DataOutputStream out = new DataOutputStream(new FileOutputStream(data)); return store(new FileOutputStream(data));
try {
writeRecursively(out, root);
out.flush();
} finally {
out.close();
}
return true;
} }
// pre-order traversal // pre-order traversal
@ -188,4 +184,28 @@ public class TSTLookup extends Lookup {
writeRecursively(out, node.hiKid); writeRecursively(out, node.hiKid);
} }
} }
/**
 * Writes the tree rooted at {@code root} to the given stream and closes
 * the stream afterwards, whether or not the write succeeded.
 *
 * @param output destination stream; always closed by this method
 * @return {@code true} when the tree was written
 * @throws IOException if writing or closing the stream fails
 */
@Override
public synchronized boolean store(OutputStream output) throws IOException {
  final DataOutputStream data = new DataOutputStream(output);
  try {
    writeRecursively(data, root);
    data.flush();
    return true;
  } finally {
    // Closes the underlying stream directly; the wrapper was flushed above.
    IOUtils.close(output);
  }
}
/**
 * Resets {@code root} to a fresh node and fills it from the given stream.
 * The stream is closed afterwards, whether or not the read succeeded.
 *
 * @param input source stream; closed by this method
 * @return {@code true} when the tree was read
 * @throws IOException if reading or closing the stream fails
 */
@Override
public synchronized boolean load(InputStream input) throws IOException {
  root = new TernaryTreeNode();
  final DataInputStream data = new DataInputStream(input);
  try {
    readRecursively(data, root);
    return true;
  } finally {
    IOUtils.close(data);
  }
}
} }

View File

@ -18,15 +18,17 @@ package org.apache.lucene.search.spell;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.Iterator;
import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.TextField; import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
/** /**
@ -40,7 +42,8 @@ public class TestLuceneDictionary extends LuceneTestCase {
private IndexReader indexReader = null; private IndexReader indexReader = null;
private LuceneDictionary ld; private LuceneDictionary ld;
private Iterator<String> it; private BytesRefIterator it;
private BytesRef spare = new BytesRef();
@Override @Override
public void setUp() throws Exception { public void setUp() throws Exception {
@ -84,13 +87,12 @@ public class TestLuceneDictionary extends LuceneTestCase {
public void testFieldNonExistent() throws IOException { public void testFieldNonExistent() throws IOException {
try { try {
indexReader = IndexReader.open(store); indexReader = DirectoryReader.open(store);
ld = new LuceneDictionary(indexReader, "nonexistent_field"); ld = new LuceneDictionary(indexReader, "nonexistent_field");
it = ld.getWordsIterator(); it = ld.getWordsIterator();
assertFalse("More elements than expected", it.hasNext()); assertNull("More elements than expected", spare = it.next());
assertTrue("Nonexistent element is really null", it.next() == null);
} finally { } finally {
if (indexReader != null) { indexReader.close(); } if (indexReader != null) { indexReader.close(); }
} }
@ -98,15 +100,13 @@ public class TestLuceneDictionary extends LuceneTestCase {
public void testFieldAaa() throws IOException { public void testFieldAaa() throws IOException {
try { try {
indexReader = IndexReader.open(store); indexReader = DirectoryReader.open(store);
ld = new LuceneDictionary(indexReader, "aaa"); ld = new LuceneDictionary(indexReader, "aaa");
it = ld.getWordsIterator(); it = ld.getWordsIterator();
assertNotNull("First element doesn't exist.", spare = it.next());
assertTrue("First element doesn't exist.", it.hasNext()); assertTrue("First element isn't correct", spare.utf8ToString().equals("foo"));
assertTrue("First element isn't correct", it.next().equals("foo")); assertNull("More elements than expected", it.next());
assertFalse("More elements than expected", it.hasNext());
assertTrue("Nonexistent element is really null", it.next() == null);
} finally { } finally {
if (indexReader != null) { indexReader.close(); } if (indexReader != null) { indexReader.close(); }
} }
@ -114,24 +114,22 @@ public class TestLuceneDictionary extends LuceneTestCase {
public void testFieldContents_1() throws IOException { public void testFieldContents_1() throws IOException {
try { try {
indexReader = IndexReader.open(store); indexReader = DirectoryReader.open(store);
ld = new LuceneDictionary(indexReader, "contents"); ld = new LuceneDictionary(indexReader, "contents");
it = ld.getWordsIterator(); it = ld.getWordsIterator();
assertTrue("First element doesn't exist.", it.hasNext()); assertNotNull("First element doesn't exist.", spare = it.next());
assertTrue("First element isn't correct", it.next().equals("Jerry")); assertTrue("First element isn't correct", spare.utf8ToString().equals("Jerry"));
assertTrue("Second element doesn't exist.", it.hasNext()); assertNotNull("Second element doesn't exist.", spare = it.next());
assertTrue("Second element isn't correct", it.next().equals("Tom")); assertTrue("Second element isn't correct", spare.utf8ToString().equals("Tom"));
assertFalse("More elements than expected", it.hasNext()); assertNull("More elements than expected", it.next());
assertTrue("Nonexistent element is really null", it.next() == null);
ld = new LuceneDictionary(indexReader, "contents"); ld = new LuceneDictionary(indexReader, "contents");
it = ld.getWordsIterator(); it = ld.getWordsIterator();
int counter = 2; int counter = 2;
while (it.hasNext()) { while (it.next() != null) {
it.next();
counter--; counter--;
} }
@ -144,30 +142,15 @@ public class TestLuceneDictionary extends LuceneTestCase {
public void testFieldContents_2() throws IOException { public void testFieldContents_2() throws IOException {
try { try {
indexReader = IndexReader.open(store); indexReader = DirectoryReader.open(store);
ld = new LuceneDictionary(indexReader, "contents"); ld = new LuceneDictionary(indexReader, "contents");
it = ld.getWordsIterator(); it = ld.getWordsIterator();
// hasNext() should have no side effects
assertTrue("First element isn't were it should be.", it.hasNext());
assertTrue("First element isn't were it should be.", it.hasNext());
assertTrue("First element isn't were it should be.", it.hasNext());
// just iterate through words // just iterate through words
assertTrue("First element isn't correct", it.next().equals("Jerry")); assertEquals("First element isn't correct", "Jerry", it.next().utf8ToString());
assertTrue("Second element isn't correct", it.next().equals("Tom")); assertEquals("Second element isn't correct", "Tom", it.next().utf8ToString());
assertTrue("Nonexistent element is really null", it.next() == null); assertNull("Nonexistent element is really null", it.next());
// hasNext() should still have no side effects ...
assertFalse("There should be any more elements", it.hasNext());
assertFalse("There should be any more elements", it.hasNext());
assertFalse("There should be any more elements", it.hasNext());
// .. and there are really no more words
assertTrue("Nonexistent element is really null", it.next() == null);
assertTrue("Nonexistent element is really null", it.next() == null);
assertTrue("Nonexistent element is really null", it.next() == null);
} }
finally { finally {
if (indexReader != null) { indexReader.close(); } if (indexReader != null) { indexReader.close(); }
@ -176,15 +159,14 @@ public class TestLuceneDictionary extends LuceneTestCase {
public void testFieldZzz() throws IOException { public void testFieldZzz() throws IOException {
try { try {
indexReader = IndexReader.open(store); indexReader = DirectoryReader.open(store);
ld = new LuceneDictionary(indexReader, "zzz"); ld = new LuceneDictionary(indexReader, "zzz");
it = ld.getWordsIterator(); it = ld.getWordsIterator();
assertTrue("First element doesn't exist.", it.hasNext()); assertNotNull("First element doesn't exist.", spare = it.next());
assertTrue("First element isn't correct", it.next().equals("bar")); assertEquals("First element isn't correct", "bar", spare.utf8ToString());
assertFalse("More elements than expected", it.hasNext()); assertNull("More elements than expected", it.next());
assertTrue("Nonexistent element is really null", it.next() == null);
} }
finally { finally {
if (indexReader != null) { indexReader.close(); } if (indexReader != null) { indexReader.close(); }
@ -194,7 +176,7 @@ public class TestLuceneDictionary extends LuceneTestCase {
public void testSpellchecker() throws IOException { public void testSpellchecker() throws IOException {
Directory dir = newDirectory(); Directory dir = newDirectory();
SpellChecker sc = new SpellChecker(dir); SpellChecker sc = new SpellChecker(dir);
indexReader = IndexReader.open(store); indexReader = DirectoryReader.open(store);
sc.indexDictionary(new LuceneDictionary(indexReader, "contents"), newIndexWriterConfig(TEST_VERSION_CURRENT, null), false); sc.indexDictionary(new LuceneDictionary(indexReader, "contents"), newIndexWriterConfig(TEST_VERSION_CURRENT, null), false);
String[] suggestions = sc.suggestSimilar("Tam", 1); String[] suggestions = sc.suggestSimilar("Tam", 1);
assertEquals(1, suggestions.length); assertEquals(1, suggestions.length);

View File

@ -191,7 +191,7 @@ public class LookupBenchmarkTest extends LuceneTestCase {
final List<String> input = new ArrayList<String>(benchmarkInput.size()); final List<String> input = new ArrayList<String>(benchmarkInput.size());
for (TermFreq tf : benchmarkInput) { for (TermFreq tf : benchmarkInput) {
input.add(tf.term.substring(0, Math.min(tf.term.length(), input.add(tf.term.utf8ToString().substring(0, Math.min(tf.term.length,
minPrefixLen + random.nextInt(maxPrefixLen - minPrefixLen + 1)))); minPrefixLen + random.nextInt(maxPrefixLen - minPrefixLen + 1))));
} }

View File

@ -75,11 +75,11 @@ public class PersistenceTest extends LuceneTestCase {
// Assert validity. // Assert validity.
float previous = Float.NEGATIVE_INFINITY; float previous = Float.NEGATIVE_INFINITY;
for (TermFreq k : keys) { for (TermFreq k : keys) {
Float val = (Float) lookup.get(k.term); Float val = (Float) lookup.get(k.term.utf8ToString());
assertNotNull(k.term, val); assertNotNull(k.term.utf8ToString(), val);
if (supportsExactWeights) { if (supportsExactWeights) {
assertEquals(k.term, Float.valueOf(k.v), val); assertEquals(k.term.utf8ToString(), Float.valueOf(k.v), val);
} else { } else {
assertTrue(val + ">=" + previous, val >= previous); assertTrue(val + ">=" + previous, val >= previous);
previous = val.floatValue(); previous = val.floatValue();

View File

@ -1,5 +1,7 @@
package org.apache.lucene.search.suggest; package org.apache.lucene.search.suggest;
import org.apache.lucene.util.BytesRef;
/** /**
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with * contributor license agreements. See the NOTICE file distributed with
@ -18,10 +20,14 @@ package org.apache.lucene.search.suggest;
*/ */
public final class TermFreq { public final class TermFreq {
public final String term; public final BytesRef term;
public final float v; public final float v;
public TermFreq(String term, float v) { public TermFreq(String term, float v) {
this(new BytesRef(term), v);
}
public TermFreq(BytesRef term, float v) {
this.term = term; this.term = term;
this.v = v; this.v = v;
} }

View File

@ -17,10 +17,12 @@ package org.apache.lucene.search.suggest;
* limitations under the License. * limitations under the License.
*/ */
import java.io.IOException;
import java.util.Arrays; import java.util.Arrays;
import java.util.Iterator; import java.util.Iterator;
import org.apache.lucene.search.spell.TermFreqIterator; import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.util.BytesRef;
/** /**
* A {@link TermFreqIterator} over a sequence of {@link TermFreq}s. * A {@link TermFreqIterator} over a sequence of {@link TermFreq}s.
@ -28,6 +30,7 @@ import org.apache.lucene.search.spell.TermFreqIterator;
public final class TermFreqArrayIterator implements TermFreqIterator { public final class TermFreqArrayIterator implements TermFreqIterator {
private final Iterator<TermFreq> i; private final Iterator<TermFreq> i;
private TermFreq current; private TermFreq current;
private final BytesRef spare = new BytesRef();
public TermFreqArrayIterator(Iterator<TermFreq> i) { public TermFreqArrayIterator(Iterator<TermFreq> i) {
this.i = i; this.i = i;
@ -44,14 +47,14 @@ public final class TermFreqArrayIterator implements TermFreqIterator {
public float freq() { public float freq() {
return current.v; return current.v;
} }
public boolean hasNext() {
return i.hasNext();
}
public String next() {
return (current = i.next()).term;
}
public void remove() { throw new UnsupportedOperationException(); } @Override
public BytesRef next() throws IOException {
if (i.hasNext()) {
current = i.next();
spare.copyBytes(current.term);
return spare;
}
return null;
}
} }

View File

@ -0,0 +1,85 @@
package org.apache.lucene.search.suggest;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with this
* work for additional information regarding copyright ownership. The ASF
* licenses this file to You under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
/**
 * Tests for {@link BytesRefList}: appending, positional access, iteration
 * and sorting, each checked against a parallel list of the source strings.
 */
public class TestBytesRefList extends LuceneTestCase {

  /**
   * Appends random strings and verifies that they can be read back
   * sequentially, at random positions, and through repeated iteration.
   */
  public void testAppend() throws IOException {
    BytesRefList list = new BytesRefList();
    List<String> stringList = new ArrayList<String>();
    int entries = atLeast(500);
    BytesRef spare = new BytesRef();
    for (int i = 0; i < entries; i++) {
      String randomRealisticUnicodeString = _TestUtil
          .randomRealisticUnicodeString(random);
      spare.copyChars(randomRealisticUnicodeString);
      list.append(spare);
      stringList.add(randomRealisticUnicodeString);
    }
    for (int i = 0; i < entries; i++) {
      assertNotNull(list.get(spare, i));
      assertEquals("entry " + i + " doesn't match", stringList.get(i),
          spare.utf8ToString());
    }

    // check random
    for (int i = 0; i < entries; i++) {
      int e = random.nextInt(entries);
      assertNotNull(list.get(spare, e));
      // Report the entry that was actually read (e), not the loop counter.
      assertEquals("entry " + e + " doesn't match", stringList.get(e),
          spare.utf8ToString());
    }
    // Iterate twice to check that a second iterator starts from the beginning.
    for (int i = 0; i < 2; i++) {
      BytesRefIterator iterator = list.iterator();
      for (String string : stringList) {
        assertEquals(string, iterator.next().utf8ToString());
      }
    }
  }

  /**
   * Appends random strings, sorts the list with the UTF-16-order comparator
   * and verifies the resulting ordinals against {@link Collections#sort}
   * applied to the source strings.
   */
  public void testSort() {
    BytesRefList list = new BytesRefList();
    List<String> stringList = new ArrayList<String>();
    int entries = atLeast(500);
    BytesRef spare = new BytesRef();
    for (int i = 0; i < entries; i++) {
      String randomRealisticUnicodeString = _TestUtil.randomRealisticUnicodeString(random);
      spare.copyChars(randomRealisticUnicodeString);
      list.append(spare);
      stringList.add(randomRealisticUnicodeString);
    }
    Collections.sort(stringList);
    int[] sortedOrds = list.sort(BytesRef.getUTF8SortedAsUTF16Comparator());
    for (int i = 0; i < entries; i++) {
      assertNotNull(list.get(spare, sortedOrds[i]));
      assertEquals("entry " + i + " doesn't match", stringList.get(i),
          spare.utf8ToString());
    }
  }
}

View File

@ -40,7 +40,7 @@ public class FSTCompletionTest extends LuceneTestCase {
FSTCompletionBuilder builder = new FSTCompletionBuilder(); FSTCompletionBuilder builder = new FSTCompletionBuilder();
for (TermFreq tf : evalKeys()) { for (TermFreq tf : evalKeys()) {
builder.add(new BytesRef(tf.term), (int) tf.v); builder.add(tf.term, (int) tf.v);
} }
completion = builder.build(); completion = builder.build();
completionAlphabetical = new FSTCompletion(completion.getFST(), false, true); completionAlphabetical = new FSTCompletion(completion.getFST(), false, true);
@ -167,7 +167,7 @@ public class FSTCompletionTest extends LuceneTestCase {
// are. // are.
Float previous = null; Float previous = null;
for (TermFreq tf : keys) { for (TermFreq tf : keys) {
Float current = lookup.get(tf.term); Float current = lookup.get(tf.term.utf8ToString());
if (previous != null) { if (previous != null) {
assertEquals(previous, current); assertEquals(previous, current);
} }
@ -183,8 +183,8 @@ public class FSTCompletionTest extends LuceneTestCase {
lookup.build(new TermFreqArrayIterator(input)); lookup.build(new TermFreqArrayIterator(input));
for (TermFreq tf : input) { for (TermFreq tf : input) {
assertTrue("Not found: " + tf.term, lookup.get(tf.term) != null); assertTrue("Not found: " + tf.term, lookup.get(tf.term.utf8ToString()) != null);
assertEquals(tf.term, lookup.lookup(tf.term, true, 1).get(0).key); assertEquals(tf.term, lookup.lookup(tf.term.utf8ToString(), true, 1).get(0).key);
} }
List<LookupResult> result = lookup.lookup("wit", true, 5); List<LookupResult> result = lookup.lookup("wit", true, 5);
@ -211,7 +211,7 @@ public class FSTCompletionTest extends LuceneTestCase {
lookup.build(new TermFreqArrayIterator(freqs.toArray(new TermFreq[freqs.size()]))); lookup.build(new TermFreqArrayIterator(freqs.toArray(new TermFreq[freqs.size()])));
for (TermFreq tf : freqs) { for (TermFreq tf : freqs) {
final String term = tf.term; final String term = tf.term.utf8ToString();
for (int i = 1; i < term.length(); i++) { for (int i = 1; i < term.length(); i++) {
String prefix = term.substring(0, i); String prefix = term.substring(0, i);
for (LookupResult lr : lookup.lookup(prefix, true, 10)) { for (LookupResult lr : lookup.lookup(prefix, true, 10)) {