mirror of https://github.com/apache/lucene.git
Optimized TermDocs.skipTo() and changed scorers to take advantage of it.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150170 13f79535-47bb-0310-9956-ffa450edef68
parent 07829a37a7
commit 6f8347c6fe

CHANGES.txt: 30 lines added
@@ -2,6 +2,36 @@ Lucene Change Log

$Id$

1.4 RC1

 1. Changed the format of the .tis file, so that:

    - it has a format version number, which makes it easier to
      back-compatibly change file formats in the future.

    - the term count is now stored as a long.  This was the one aspect
      of Lucene's file formats which limited index size.

    - a few internal index parameters are now stored in the index, so
      that they can (in theory) now be changed from index to index,
      although there is not yet an API to do so.

    These changes are back compatible.  The new code can read old
    indexes, but old code will not be able to read new indexes. (cutting)

 2. Added an optimized implementation of TermDocs.skipTo().  A skip
    table is now stored for each term in the .frq file.  This only
    adds a percent or two to overall index size, but can substantially
    speed up many searches. (cutting)

 3. Restructured the Scorer API and all Scorer implementations to take
    advantage of an optimized TermDocs.skipTo() implementation.  In
    particular, PhraseQuerys and conjunctive BooleanQuerys are
    faster when one clause has substantially fewer matches than the
    others.  (A conjunctive BooleanQuery is a BooleanQuery where all
    clauses are required; a sketch of this skip-driven loop follows
    this hunk.) (cutting)


1.3 final

 1. Added catch of BooleanQuery$TooManyClauses in QueryParser to
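What follows is an illustrative sketch, not part of the commit: the skip-driven
conjunction loop that entries 2 and 3 describe, written against the new Scorer
contract (next(), doc(), skipTo()) that appears further down in this diff.  The
class and method names are hypothetical.

import java.io.IOException;
import org.apache.lucene.search.Scorer;

// Hypothetical helper, for illustration only.
final class LeapfrogSketch {

  /** Advances every required scorer to the next document they all match.
   *  Assumes each scorer has already been positioned with next(). */
  static boolean nextCommonDoc(Scorer[] required) throws IOException {
    while (true) {
      int max = required[0].doc();                 // most-advanced clause
      for (int i = 1; i < required.length; i++)
        max = Math.max(max, required[i].doc());

      boolean aligned = true;
      for (int i = 0; i < required.length; i++) {
        if (required[i].doc() < max) {             // this clause lags behind
          if (!required[i].skipTo(max))            // skip table avoids scanning
            return false;                          // clause exhausted, no more matches
          aligned = false;                         // re-check against the new leader
        }
      }
      if (aligned)
        return true;                               // every clause is on the same doc
    }
  }
}

ConjunctionScorer and PhraseScorer below implement essentially this loop over a
sorted list of their clauses.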
@ -291,7 +291,7 @@ final class DocumentWriter {
|
|||
Posting posting = postings[i];
|
||||
|
||||
// add an entry to the dictionary with pointers to prox and freq files
|
||||
ti.set(1, freq.getFilePointer(), prox.getFilePointer());
|
||||
ti.set(1, freq.getFilePointer(), prox.getFilePointer(), -1);
|
||||
tis.add(posting.term, ti);
|
||||
|
||||
// add an entry to the freq file
|
||||
|
|
|
@ -62,6 +62,7 @@ import java.io.IOException;
|
|||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.OutputStream;
|
||||
import org.apache.lucene.store.InputStream;
|
||||
import org.apache.lucene.store.RAMOutputStream;
|
||||
import org.apache.lucene.util.BitVector;
|
||||
|
||||
final class SegmentMerger {
|
||||
|
@ -246,17 +247,21 @@ final class SegmentMerger {
|
|||
|
||||
int df = appendPostings(smis, n); // append posting data
|
||||
|
||||
long skipPointer = writeSkip();
|
||||
|
||||
if (df > 0) {
|
||||
// add an entry to the dictionary with pointers to prox and freq files
|
||||
termInfo.set(df, freqPointer, proxPointer);
|
||||
termInfo.set(df, freqPointer, proxPointer, (int)(skipPointer-freqPointer));
|
||||
termInfosWriter.add(smis[0].term, termInfo);
|
||||
}
|
||||
}
|
||||
|
||||
private final int appendPostings(SegmentMergeInfo[] smis, int n)
|
||||
throws IOException {
|
||||
final int skipInterval = termInfosWriter.skipInterval;
|
||||
int lastDoc = 0;
|
||||
int df = 0; // number of docs w/ term
|
||||
resetSkip();
|
||||
for (int i = 0; i < n; i++) {
|
||||
SegmentMergeInfo smi = smis[i];
|
||||
TermPositions postings = smi.postings;
|
||||
|
@ -272,6 +277,12 @@ final class SegmentMerger {
|
|||
if (doc < lastDoc)
|
||||
throw new IllegalStateException("docs out of order");
|
||||
|
||||
df++;
|
||||
|
||||
if ((df % skipInterval) == 0) {
|
||||
bufferSkip(lastDoc);
|
||||
}
|
||||
|
||||
int docCode = (doc - lastDoc) << 1; // use low bit to flag freq=1
|
||||
lastDoc = doc;
|
||||
|
||||
|
@ -289,13 +300,43 @@ final class SegmentMerger {
|
|||
proxOutput.writeVInt(position - lastPosition);
|
||||
lastPosition = position;
|
||||
}
|
||||
|
||||
df++;
|
||||
}
|
||||
}
|
||||
return df;
|
||||
}
|
||||
private final void mergeNorms() throws IOException {
|
||||
|
||||
private RAMOutputStream skipBuffer = new RAMOutputStream();
|
||||
private int lastSkipDoc;
|
||||
private long lastSkipFreqPointer;
|
||||
private long lastSkipProxPointer;
|
||||
|
||||
private void resetSkip() throws IOException {
|
||||
skipBuffer.reset();
|
||||
lastSkipDoc = 0;
|
||||
lastSkipFreqPointer = freqOutput.getFilePointer();
|
||||
lastSkipProxPointer = proxOutput.getFilePointer();
|
||||
}
|
||||
|
||||
private void bufferSkip(int doc) throws IOException {
|
||||
long freqPointer = freqOutput.getFilePointer();
|
||||
long proxPointer = proxOutput.getFilePointer();
|
||||
|
||||
skipBuffer.writeVInt(doc - lastSkipDoc);
|
||||
skipBuffer.writeVInt((int)(freqPointer - lastSkipFreqPointer));
|
||||
skipBuffer.writeVInt((int)(proxPointer - lastSkipProxPointer));
|
||||
|
||||
lastSkipDoc = doc;
|
||||
lastSkipFreqPointer = freqPointer;
|
||||
lastSkipProxPointer = proxPointer;
|
||||
}
|
||||
|
||||
private long writeSkip() throws IOException {
|
||||
long skipPointer = freqOutput.getFilePointer();
|
||||
skipBuffer.writeTo(freqOutput);
|
||||
return skipPointer;
|
||||
}
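// For reference (not part of the diff): each record that bufferSkip() queues is
// three VInt deltas, which writeSkip() then appends to the .frq file:
//
//   docDelta          = doc         - lastSkipDoc
//   freqPointerDelta  = freqPointer - lastSkipFreqPointer
//   proxPointerDelta  = proxPointer - lastSkipProxPointer
//
// One record is buffered for every skipInterval-th document of the term, and the
// distance from the term's freqPointer to the start of these records is what the
// caller stores in the dictionary as the skipOffset argument of termInfo.set()
// (the (int)(skipPointer-freqPointer) expression above).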
|
||||
|
||||
private void mergeNorms() throws IOException {
|
||||
for (int i = 0; i < fieldInfos.size(); i++) {
|
||||
FieldInfo fi = fieldInfos.fieldInfo(i);
|
||||
if (fi.isIndexed) {
|
||||
|
|
|
@ -61,16 +61,27 @@ import org.apache.lucene.store.InputStream;
|
|||
class SegmentTermDocs implements TermDocs {
|
||||
protected SegmentReader parent;
|
||||
private InputStream freqStream;
|
||||
private int freqCount;
|
||||
private int count;
|
||||
private int df;
|
||||
private BitVector deletedDocs;
|
||||
int doc = 0;
|
||||
int freq;
|
||||
|
||||
private int skipInterval;
|
||||
private int skipCount;
|
||||
private InputStream skipStream;
|
||||
private int skipDoc;
|
||||
private long freqPointer;
|
||||
private long proxPointer;
|
||||
private long skipPointer;
|
||||
private boolean haveSkipped;
|
||||
|
||||
SegmentTermDocs(SegmentReader parent)
|
||||
throws IOException {
|
||||
this.parent = parent;
|
||||
this.freqStream = (InputStream)parent.freqStream.clone();
|
||||
this.deletedDocs = parent.deletedDocs;
|
||||
this.skipInterval = parent.tis.getSkipInterval();
|
||||
}
|
||||
|
||||
public void seek(Term term) throws IOException {
|
||||
|
@ -88,12 +99,19 @@ class SegmentTermDocs implements TermDocs {
|
|||
}
|
||||
|
||||
void seek(TermInfo ti) throws IOException {
|
||||
count = 0;
|
||||
if (ti == null) {
|
||||
freqCount = 0;
|
||||
df = 0;
|
||||
} else {
|
||||
freqCount = ti.docFreq;
|
||||
df = ti.docFreq;
|
||||
doc = 0;
|
||||
freqStream.seek(ti.freqPointer);
|
||||
skipDoc = 0;
|
||||
skipCount = 0;
|
||||
freqPointer = ti.freqPointer;
|
||||
proxPointer = ti.proxPointer;
|
||||
skipPointer = freqPointer + ti.skipOffset;
|
||||
freqStream.seek(freqPointer);
|
||||
haveSkipped = false;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -109,7 +127,7 @@ class SegmentTermDocs implements TermDocs {
|
|||
|
||||
public boolean next() throws IOException {
|
||||
while (true) {
|
||||
if (freqCount == 0)
|
||||
if (count == df)
|
||||
return false;
|
||||
|
||||
int docCode = freqStream.readVInt();
|
||||
|
@ -119,7 +137,7 @@ class SegmentTermDocs implements TermDocs {
|
|||
else
|
||||
freq = freqStream.readVInt(); // else read freq
|
||||
|
||||
freqCount--;
|
||||
count++;
|
||||
|
||||
if (deletedDocs == null || !deletedDocs.get(doc))
|
||||
break;
|
||||
|
@ -131,9 +149,9 @@ class SegmentTermDocs implements TermDocs {
|
|||
/** Optimized implementation. */
|
||||
public int read(final int[] docs, final int[] freqs)
|
||||
throws IOException {
|
||||
final int end = docs.length;
|
||||
final int length = docs.length;
|
||||
int i = 0;
|
||||
while (i < end && freqCount > 0) {
|
||||
while (i < length && count < df) {
|
||||
|
||||
// manually inlined call to next() for speed
|
||||
final int docCode = freqStream.readVInt();
|
||||
|
@ -142,7 +160,7 @@ class SegmentTermDocs implements TermDocs {
|
|||
freq = 1; // freq is one
|
||||
else
|
||||
freq = freqStream.readVInt(); // else read freq
|
||||
freqCount--;
|
||||
count++;
|
||||
|
||||
if (deletedDocs == null || !deletedDocs.get(doc)) {
|
||||
docs[i] = doc;
|
||||
|
@ -153,12 +171,61 @@ class SegmentTermDocs implements TermDocs {
|
|||
return i;
|
||||
}
|
||||
|
||||
/** As yet unoptimized implementation. */
|
||||
/** Overridden by SegmentTermPositions to skip in prox stream. */
|
||||
protected void skipProx(long proxPointer) throws IOException {}
|
||||
|
||||
/** Optimized implementation. */
|
||||
public boolean skipTo(int target) throws IOException {
|
||||
if (df > skipInterval) { // optimized case
|
||||
|
||||
if (skipStream == null)
|
||||
skipStream = (InputStream)freqStream.clone(); // lazily clone
|
||||
|
||||
if (!haveSkipped) { // lazily seek skip stream
|
||||
skipStream.seek(skipPointer);
|
||||
haveSkipped = true;
|
||||
}
|
||||
|
||||
// scan skip data
|
||||
int lastSkipDoc = skipDoc;
|
||||
long lastFreqPointer = freqStream.getFilePointer();
|
||||
long lastProxPointer = -1;
|
||||
int numSkipped = -1 -(count % skipInterval);
|
||||
|
||||
while (target > skipDoc) {
|
||||
lastSkipDoc = skipDoc;
|
||||
lastFreqPointer = freqPointer;
|
||||
lastProxPointer = proxPointer;
|
||||
if (skipDoc >= doc)
|
||||
numSkipped += skipInterval;
|
||||
|
||||
if ((count + numSkipped + skipInterval) > df)
|
||||
break; // no more skips
|
||||
|
||||
skipDoc += skipStream.readVInt();
|
||||
freqPointer += skipStream.readVInt();
|
||||
proxPointer += skipStream.readVInt();
|
||||
|
||||
skipCount++;
|
||||
}
|
||||
|
||||
// if we found something to skip, then skip it
|
||||
if (lastFreqPointer > freqStream.getFilePointer()) {
|
||||
freqStream.seek(lastFreqPointer);
|
||||
skipProx(lastProxPointer);
|
||||
|
||||
doc = lastSkipDoc;
|
||||
count += numSkipped;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// done skipping, now just scan
|
||||
do {
|
||||
if (!next())
|
||||
return false;
|
||||
} while (target > doc);
|
||||
return true;
|
||||
}
|
||||
|
||||
}
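// For reference (not part of the diff), the shape of the skipTo() implementation
// above:
//   1. Only terms with df > skipInterval have skip data; skipStream is a lazy
//      clone of freqStream, seeked to skipPointer (freqPointer + skipOffset).
//   2. Skip records (doc, freq-pointer and prox-pointer deltas) are read while
//      the target is still beyond skipDoc; the last record before the target is
//      remembered, and numSkipped counts the postings those records cover.
//   3. freqStream is then seeked to the remembered freq pointer, skipProx()
//      repositions the prox stream in SegmentTermPositions, and doc/count are
//      adjusted accordingly.
//   4. The remaining postings (typically at most one skip interval's worth) are
//      scanned with the ordinary next() loop until doc >= target.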
|
||||
|
|
|
@ -60,14 +60,17 @@ import org.apache.lucene.store.InputStream;
|
|||
final class SegmentTermEnum extends TermEnum implements Cloneable {
|
||||
private InputStream input;
|
||||
private FieldInfos fieldInfos;
|
||||
int size;
|
||||
int position = -1;
|
||||
long size;
|
||||
long position = -1;
|
||||
|
||||
private Term term = new Term("", "");
|
||||
private TermInfo termInfo = new TermInfo();
|
||||
|
||||
boolean isIndex = false;
|
||||
private int format;
|
||||
private boolean isIndex = false;
|
||||
long indexPointer = 0;
|
||||
int indexInterval;
|
||||
int skipInterval;
|
||||
Term prev;
|
||||
|
||||
private char[] buffer = {};
|
||||
|
@ -76,8 +79,34 @@ final class SegmentTermEnum extends TermEnum implements Cloneable {
|
|||
throws IOException {
|
||||
input = i;
|
||||
fieldInfos = fis;
|
||||
size = input.readInt();
|
||||
isIndex = isi;
|
||||
|
||||
int firstInt = input.readInt();
|
||||
if (firstInt >= 0) {
|
||||
// original-format file, without explicit format version number
|
||||
format = 0;
|
||||
size = firstInt;
|
||||
|
||||
// back-compatible settings
|
||||
indexInterval = 128;
|
||||
skipInterval = Integer.MAX_VALUE;
|
||||
|
||||
} else {
|
||||
// we have a format version number
|
||||
format = firstInt;
|
||||
|
||||
// check that it is a format we can understand
|
||||
if (format < TermInfosWriter.FORMAT)
|
||||
throw new IOException("Unknown format version:" + format);
|
||||
|
||||
size = input.readLong(); // read the size
|
||||
|
||||
if (!isIndex) {
|
||||
indexInterval = input.readInt();
|
||||
skipInterval = input.readInt();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
protected Object clone() {
|
||||
|
@ -117,6 +146,12 @@ final class SegmentTermEnum extends TermEnum implements Cloneable {
|
|||
termInfo.freqPointer += input.readVLong(); // read freq pointer
|
||||
termInfo.proxPointer += input.readVLong(); // read prox pointer
|
||||
|
||||
if (!isIndex) {
|
||||
if (termInfo.docFreq > skipInterval) {
|
||||
termInfo.skipOffset = input.readVInt();
|
||||
}
|
||||
}
|
||||
|
||||
if (isIndex)
|
||||
indexPointer += input.readVLong(); // read index pointer
|
||||
|
||||
|
|
|
@ -109,4 +109,11 @@ extends SegmentTermDocs implements TermPositions {
|
|||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
|
||||
/** Called by super.skipTo(). */
|
||||
protected void skipProx(long proxPointer) throws IOException {
|
||||
proxStream.seek(proxPointer);
|
||||
proxCount = 0;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -62,6 +62,7 @@ final class TermInfo {
|
|||
|
||||
long freqPointer = 0;
|
||||
long proxPointer = 0;
|
||||
int skipOffset;
|
||||
|
||||
TermInfo() {}
|
||||
|
||||
|
@ -75,17 +76,21 @@ final class TermInfo {
|
|||
docFreq = ti.docFreq;
|
||||
freqPointer = ti.freqPointer;
|
||||
proxPointer = ti.proxPointer;
|
||||
skipOffset = ti.skipOffset;
|
||||
}
|
||||
|
||||
final void set(int df, long fp, long pp) {
|
||||
docFreq = df;
|
||||
freqPointer = fp;
|
||||
proxPointer = pp;
|
||||
final void set(int docFreq,
|
||||
long freqPointer, long proxPointer, int skipOffset) {
|
||||
this.docFreq = docFreq;
|
||||
this.freqPointer = freqPointer;
|
||||
this.proxPointer = proxPointer;
|
||||
this.skipOffset = skipOffset;
|
||||
}
|
||||
|
||||
final void set(TermInfo ti) {
|
||||
docFreq = ti.docFreq;
|
||||
freqPointer = ti.freqPointer;
|
||||
proxPointer = ti.proxPointer;
|
||||
skipOffset = ti.skipOffset;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -68,7 +68,7 @@ final class TermInfosReader {
|
|||
private FieldInfos fieldInfos;
|
||||
|
||||
private SegmentTermEnum enumerator;
|
||||
private int size;
|
||||
private long size;
|
||||
|
||||
TermInfosReader(Directory dir, String seg, FieldInfos fis)
|
||||
throws IOException {
|
||||
|
@ -82,13 +82,17 @@ final class TermInfosReader {
|
|||
readIndex();
|
||||
}
|
||||
|
||||
public int getSkipInterval() {
|
||||
return enumerator.skipInterval;
|
||||
}
|
||||
|
||||
final void close() throws IOException {
|
||||
if (enumerator != null)
|
||||
enumerator.close();
|
||||
}
|
||||
|
||||
/** Returns the number of term/value pairs in the set. */
|
||||
final int size() {
|
||||
final long size() {
|
||||
return size;
|
||||
}
|
||||
|
||||
|
@ -101,7 +105,7 @@ final class TermInfosReader {
|
|||
new SegmentTermEnum(directory.openFile(segment + ".tii"),
|
||||
fieldInfos, true);
|
||||
try {
|
||||
int indexSize = indexEnum.size;
|
||||
int indexSize = (int)indexEnum.size;
|
||||
|
||||
indexTerms = new Term[indexSize];
|
||||
indexInfos = new TermInfo[indexSize];
|
||||
|
@ -137,7 +141,7 @@ final class TermInfosReader {
|
|||
|
||||
private final void seekEnum(int indexOffset) throws IOException {
|
||||
enumerator.seek(indexPointers[indexOffset],
|
||||
(indexOffset * TermInfosWriter.INDEX_INTERVAL) - 1,
|
||||
(indexOffset * enumerator.indexInterval) - 1,
|
||||
indexTerms[indexOffset], indexInfos[indexOffset]);
|
||||
}
|
||||
|
||||
|
@ -146,10 +150,10 @@ final class TermInfosReader {
|
|||
if (size == 0) return null;
|
||||
|
||||
// optimize sequential access: first try scanning cached enumerator w/o seeking
|
||||
if (enumerator.term() != null // term is at or past current
|
||||
if (enumerator.term() != null // term is at or past current
|
||||
&& ((enumerator.prev != null && term.compareTo(enumerator.prev) > 0)
|
||||
|| term.compareTo(enumerator.term()) >= 0)) {
|
||||
int enumOffset = (enumerator.position/TermInfosWriter.INDEX_INTERVAL)+1;
|
||||
int enumOffset = (int)(enumerator.position/enumerator.indexInterval)+1;
|
||||
if (indexTerms.length == enumOffset // but before end of block
|
||||
|| term.compareTo(indexTerms[enumOffset]) < 0)
|
||||
return scanEnum(term); // no need to seek
|
||||
|
@ -174,10 +178,10 @@ final class TermInfosReader {
|
|||
if (size == 0) return null;
|
||||
|
||||
if (enumerator != null && enumerator.term() != null && position >= enumerator.position &&
|
||||
position < (enumerator.position + TermInfosWriter.INDEX_INTERVAL))
|
||||
position < (enumerator.position + enumerator.indexInterval))
|
||||
return scanEnum(position); // can avoid seek
|
||||
|
||||
seekEnum(position / TermInfosWriter.INDEX_INTERVAL); // must seek
|
||||
seekEnum(position / enumerator.indexInterval); // must seek
|
||||
return scanEnum(position);
|
||||
}
|
||||
|
||||
|
@ -190,7 +194,7 @@ final class TermInfosReader {
|
|||
}
|
||||
|
||||
/** Returns the position of a Term in the set or -1. */
|
||||
final synchronized int getPosition(Term term) throws IOException {
|
||||
final synchronized long getPosition(Term term) throws IOException {
|
||||
if (size == 0) return -1;
|
||||
|
||||
int indexOffset = getIndexOffset(term);
|
||||
|
|
|
@ -62,13 +62,36 @@ import org.apache.lucene.store.Directory;
|
|||
Directory. A TermInfos can be written once, in order. */
|
||||
|
||||
final class TermInfosWriter {
|
||||
/** The file format version, a negative number. */
|
||||
public static final int FORMAT = -1;
|
||||
|
||||
private FieldInfos fieldInfos;
|
||||
private OutputStream output;
|
||||
private Term lastTerm = new Term("", "");
|
||||
private TermInfo lastTi = new TermInfo();
|
||||
private int size = 0;
|
||||
|
||||
static final int INDEX_INTERVAL = 128;
|
||||
// TODO: the default values for these two parameters should be settable from
|
||||
// IndexWriter. However, once that's done, folks will start setting them to
|
||||
// ridiculous values and complaining that things don't work well, as with
|
||||
// mergeFactor. So, let's wait until a number of folks find that alternate
|
||||
// values work better. Note that both of these values are stored in the
|
||||
// segment, so that it's safe to change these w/o rebuilding all indexes.
|
||||
|
||||
/** Expert: The fraction of terms in the "dictionary" which should be stored
|
||||
* in RAM. Smaller values use more memory, but make searching slightly
|
||||
* faster, while larger values use less memory and make searching slightly
|
||||
* slower. Searching is typically not dominated by dictionary lookup, so
|
||||
* tweaking this is rarely useful.*/
|
||||
int indexInterval = 128;
|
||||
|
||||
/** Expert: The fraction of {@link TermDocs} entries stored in skip tables,
* used to accelerate {@link TermDocs#skipTo(int)}.  Larger values result in
* smaller indexes, greater acceleration, but fewer accelerable cases, while
* smaller values result in bigger indexes, less acceleration and more
* accelerable cases.  More detailed experiments would be useful here. */
int skipInterval = 16;
|
||||
|
||||
private long lastIndexPointer = 0;
|
||||
private boolean isIndex = false;
|
||||
|
||||
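Rough arithmetic, not from the commit, just to connect skipInterval to the
"percent or two" size estimate in CHANGES.txt above:

    skip records per term  ≈  docFreq / skipInterval
    e.g. docFreq = 100,000 and skipInterval = 16   ->  6,250 records
         each record is 3 VInts (a few bytes each) ->  tens of KB of skip data,
         against several hundred KB of postings for the same term.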
|
@ -91,7 +114,12 @@ final class TermInfosWriter {
|
|||
fieldInfos = fis;
|
||||
isIndex = isi;
|
||||
output = directory.createFile(segment + (isIndex ? ".tii" : ".tis"));
|
||||
output.writeInt(0); // leave space for size
|
||||
output.writeInt(FORMAT); // write format
|
||||
output.writeLong(0); // leave space for size
|
||||
if (!isIndex) {
|
||||
output.writeInt(indexInterval); // write indexInterval
|
||||
output.writeInt(skipInterval); // write skipInterval
|
||||
}
|
||||
}
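// For reference (not part of the diff): the header this constructor writes to a
// .tis or .tii file, and which the SegmentTermEnum constructor above reads back:
//
//   int   FORMAT         -- negative; a non-negative first int identifies the
//                           pre-existing format, where it was simply the term count
//   long  size           -- placeholder, patched by close() via seek(4)
//   int   indexInterval  -- .tis only (isIndex == false)
//   int   skipInterval   -- .tis only (isIndex == false)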
|
||||
|
||||
/** Adds a new <Term, TermInfo> pair to the set.
|
||||
|
@ -106,7 +134,7 @@ final class TermInfosWriter {
|
|||
if (ti.proxPointer < lastTi.proxPointer)
|
||||
throw new IOException("proxPointer out of order");
|
||||
|
||||
if (!isIndex && size % INDEX_INTERVAL == 0)
|
||||
if (!isIndex && size % indexInterval == 0)
|
||||
other.add(lastTerm, lastTi); // add an index term
|
||||
|
||||
writeTerm(term); // write term
|
||||
|
@ -114,6 +142,12 @@ final class TermInfosWriter {
|
|||
output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers
|
||||
output.writeVLong(ti.proxPointer - lastTi.proxPointer);
|
||||
|
||||
if (!isIndex) {
|
||||
if (ti.docFreq > skipInterval) {
|
||||
output.writeVInt(ti.skipOffset);
|
||||
}
|
||||
}
|
||||
|
||||
if (isIndex) {
|
||||
output.writeVLong(other.output.getFilePointer() - lastIndexPointer);
|
||||
lastIndexPointer = other.output.getFilePointer(); // write pointer
|
||||
|
@ -149,8 +183,8 @@ final class TermInfosWriter {
|
|||
|
||||
/** Called to complete TermInfos creation. */
|
||||
final void close() throws IOException {
|
||||
output.seek(0); // write size at start
|
||||
output.writeInt(size);
|
||||
output.seek(4); // write size after format
|
||||
output.writeLong(size);
|
||||
output.close();
|
||||
|
||||
if (!isIndex)
|
||||
|
|
|
@ -158,6 +158,37 @@ public class BooleanQuery extends Query {
|
|||
}
|
||||
|
||||
public Scorer scorer(IndexReader reader) throws IOException {
|
||||
// First see if the (faster) ConjunctionScorer will work. This can be
|
||||
// used when all clauses are required. Also, at this point a
|
||||
// BooleanScorer cannot be embedded in a ConjunctionScorer, as the hits
|
||||
// from a BooleanScorer are not always sorted by document number (sigh)
|
||||
// and hence BooleanScorer cannot implement skipTo() correctly, which is
|
||||
// required by ConjunctionScorer.
|
||||
boolean allRequired = true;
|
||||
boolean noneBoolean = true;
|
||||
for (int i = 0 ; i < weights.size(); i++) {
|
||||
BooleanClause c = (BooleanClause)clauses.elementAt(i);
|
||||
if (!c.required)
|
||||
allRequired = false;
|
||||
if (c.query instanceof BooleanQuery)
|
||||
noneBoolean = false;
|
||||
}
|
||||
|
||||
if (allRequired && noneBoolean) { // ConjunctionScorer is okay
|
||||
ConjunctionScorer result =
|
||||
new ConjunctionScorer(searcher.getSimilarity());
|
||||
for (int i = 0 ; i < weights.size(); i++) {
|
||||
BooleanClause c = (BooleanClause)clauses.elementAt(i);
|
||||
Weight w = (Weight)weights.elementAt(i);
|
||||
Scorer subScorer = w.scorer(reader);
|
||||
if (subScorer == null)
|
||||
return null;
|
||||
result.add(subScorer);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// Use good-old BooleanScorer instead.
|
||||
BooleanScorer result = new BooleanScorer(searcher.getSimilarity());
|
||||
|
||||
for (int i = 0 ; i < weights.size(); i++) {
|
||||
|
|
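For illustration (not part of the commit): what a "conjunctive" BooleanQuery
looks like to the scorer() method above in the 1.x API -- every clause added as
required and none prohibited, with no nested BooleanQuery, so the
ConjunctionScorer branch is taken.  Field and term names are made up.

import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.TermQuery;

public class ConjunctiveQueryExample {
  public static void main(String[] args) {
    BooleanQuery query = new BooleanQuery();
    // add(Query, required, prohibited): both clauses required, none prohibited
    query.add(new TermQuery(new Term("contents", "apache")), true, false);
    query.add(new TermQuery(new Term("contents", "lucene")), true, false);
    // Searching with this query can now leapfrog between the two terms'
    // postings via TermDocs.skipTo() instead of scanning both in full.
  }
}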
|
@ -76,14 +76,17 @@ final class BooleanScorer extends Scorer {
|
|||
|
||||
static final class SubScorer {
|
||||
public Scorer scorer;
|
||||
public boolean done;
|
||||
public boolean required = false;
|
||||
public boolean prohibited = false;
|
||||
public HitCollector collector;
|
||||
public SubScorer next;
|
||||
|
||||
public SubScorer(Scorer scorer, boolean required, boolean prohibited,
|
||||
HitCollector collector, SubScorer next) {
|
||||
HitCollector collector, SubScorer next)
|
||||
throws IOException {
|
||||
this.scorer = scorer;
|
||||
this.done = !scorer.next();
|
||||
this.required = required;
|
||||
this.prohibited = prohibited;
|
||||
this.collector = collector;
|
||||
|
@ -91,7 +94,8 @@ final class BooleanScorer extends Scorer {
|
|||
}
|
||||
}
|
||||
|
||||
final void add(Scorer scorer, boolean required, boolean prohibited) {
|
||||
final void add(Scorer scorer, boolean required, boolean prohibited)
|
||||
throws IOException {
|
||||
int mask = 0;
|
||||
if (required || prohibited) {
|
||||
if (nextMask == 0)
|
||||
|
@ -120,17 +124,45 @@ final class BooleanScorer extends Scorer {
|
|||
coordFactors[i] = getSimilarity().coord(i, maxCoord-1);
|
||||
}
|
||||
|
||||
public final void score(HitCollector results, int maxDoc)
|
||||
throws IOException {
|
||||
private int end;
|
||||
private Bucket current;
|
||||
|
||||
public int doc() { return current.doc; }
|
||||
|
||||
public boolean next() throws IOException {
|
||||
boolean more = false;
|
||||
do {
|
||||
while (bucketTable.first != null) { // more queued
|
||||
current = bucketTable.first;
|
||||
bucketTable.first = current.next; // pop the queue
|
||||
|
||||
// check prohibited & required
|
||||
if ((current.bits & prohibitedMask) == 0 &&
|
||||
(current.bits & requiredMask) == requiredMask) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// refill the queue
|
||||
end += BucketTable.SIZE;
|
||||
for (SubScorer sub = scorers; sub != null; sub = sub.next) {
|
||||
Scorer scorer = sub.scorer;
|
||||
while (!sub.done && scorer.doc() < end) {
|
||||
sub.collector.collect(scorer.doc(), scorer.score());
|
||||
sub.done = !scorer.next();
|
||||
}
|
||||
if (!sub.done) {
|
||||
more = true;
|
||||
}
|
||||
}
|
||||
} while (bucketTable.first != null | more);
|
||||
return false;
|
||||
}
|
||||
|
||||
public float score() throws IOException {
|
||||
if (coordFactors == null)
|
||||
computeCoordFactors();
|
||||
|
||||
while (currentDoc < maxDoc) {
|
||||
currentDoc = Math.min(currentDoc+BucketTable.SIZE, maxDoc);
|
||||
for (SubScorer t = scorers; t != null; t = t.next)
|
||||
t.scorer.score(t.collector, currentDoc);
|
||||
bucketTable.collectHits(results);
|
||||
}
|
||||
return current.score * coordFactors[current.coord];
|
||||
}
|
||||
|
||||
static final class Bucket {
|
||||
|
@ -196,7 +228,7 @@ final class BooleanScorer extends Scorer {
|
|||
bucket.score = score; // initialize score
|
||||
bucket.bits = mask; // initialize mask
|
||||
bucket.coord = 1; // initialize coord
|
||||
|
||||
|
||||
bucket.next = table.first; // push onto valid list
|
||||
table.first = bucket;
|
||||
} else { // valid bucket
|
||||
|
@ -207,6 +239,10 @@ final class BooleanScorer extends Scorer {
|
|||
}
|
||||
}
|
||||
|
||||
public boolean skipTo(int target) throws IOException {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
public Explanation explain(int doc) throws IOException {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
|
|
@ -0,0 +1,155 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2004 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
import org.apache.lucene.index.*;
|
||||
|
||||
/** Scorer for conjunctions, sets of queries, all of which are required. */
|
||||
final class ConjunctionScorer extends Scorer {
|
||||
private LinkedList scorers = new LinkedList();
|
||||
private boolean firstTime = true;
|
||||
private boolean more = true;
|
||||
private float coord;
|
||||
|
||||
public ConjunctionScorer(Similarity similarity) {
|
||||
super(similarity);
|
||||
}
|
||||
|
||||
final void add(Scorer scorer) throws IOException {
|
||||
scorers.addLast(scorer);
|
||||
}
|
||||
|
||||
private Scorer first() { return (Scorer)scorers.getFirst(); }
|
||||
private Scorer last() { return (Scorer)scorers.getLast(); }
|
||||
|
||||
public int doc() { return first().doc(); }
|
||||
|
||||
public boolean next() throws IOException {
|
||||
if (firstTime) {
|
||||
init();
|
||||
} else if (more) {
|
||||
more = last().next(); // trigger further scanning
|
||||
}
|
||||
|
||||
while (more && first().doc() < last().doc()) { // find doc w/ all clauses
|
||||
more = first().skipTo(last().doc()); // skip first upto last
|
||||
scorers.addLast(scorers.removeFirst()); // move first to last
|
||||
}
|
||||
|
||||
return more; // found a doc with all clauses
|
||||
}
|
||||
|
||||
public boolean skipTo(int target) throws IOException {
|
||||
Iterator i = scorers.iterator();
|
||||
while (more && i.hasNext()) {
|
||||
more = ((Scorer)i.next()).skipTo(target);
|
||||
}
|
||||
if (more)
|
||||
sortScorers(); // re-sort scorers
|
||||
return more;
|
||||
}
|
||||
|
||||
public float score() throws IOException {
|
||||
float score = 0.0f; // sum scores
|
||||
Iterator i = scorers.iterator();
|
||||
while (i.hasNext())
|
||||
score += ((Scorer)i.next()).score();
|
||||
score *= coord;
|
||||
return score;
|
||||
}
|
||||
|
||||
private void init() throws IOException {
|
||||
more = scorers.size() > 0;
|
||||
|
||||
// compute coord factor
|
||||
coord = getSimilarity().coord(scorers.size(), scorers.size());
|
||||
|
||||
// move each scorer to its first entry
|
||||
Iterator i = scorers.iterator();
|
||||
while (more && i.hasNext()) {
|
||||
more = ((Scorer)i.next()).next();
|
||||
}
|
||||
if (more)
|
||||
sortScorers(); // initial sort of list
|
||||
|
||||
firstTime = false;
|
||||
}
|
||||
|
||||
private void sortScorers() throws IOException {
|
||||
// move scorers to an array
|
||||
Scorer[] array = (Scorer[])scorers.toArray(new Scorer[scorers.size()]);
|
||||
scorers.clear(); // empty the list
|
||||
|
||||
Arrays.sort(array, new Comparator() { // sort the array
|
||||
public int compare(Object o1, Object o2) {
|
||||
return ((Scorer)o1).doc() - ((Scorer)o2).doc();
|
||||
}
|
||||
public boolean equals(Object o1, Object o2) {
|
||||
return ((Scorer)o1).doc() == ((Scorer)o2).doc();
|
||||
}
|
||||
});
|
||||
|
||||
for (int i = 0; i < array.length; i++) {
|
||||
scorers.addLast(array[i]); // re-build list, now sorted
|
||||
}
|
||||
}
|
||||
|
||||
public Explanation explain(int doc) throws IOException {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
}
|
|
@ -140,7 +140,7 @@ public class IndexSearcher extends Searcher {
|
|||
hq.insert(new ScoreDoc(doc, score));
|
||||
}
|
||||
}
|
||||
}, reader.maxDoc());
|
||||
});
|
||||
|
||||
ScoreDoc[] scoreDocs = new ScoreDoc[hq.size()];
|
||||
for (int i = hq.size()-1; i >= 0; i--) // put docs in array
|
||||
|
@ -180,7 +180,7 @@ public class IndexSearcher extends Searcher {
|
|||
Scorer scorer = query.weight(this).scorer(reader);
|
||||
if (scorer == null)
|
||||
return;
|
||||
scorer.score(collector, reader.maxDoc());
|
||||
scorer.score(collector);
|
||||
}
|
||||
|
||||
public Query rewrite(Query original) throws IOException {
|
||||
|
|
|
@ -68,19 +68,31 @@ final class PhrasePositions {
|
|||
PhrasePositions(TermPositions t, int o) throws IOException {
|
||||
tp = t;
|
||||
offset = o;
|
||||
next();
|
||||
}
|
||||
|
||||
final void next() throws IOException { // increments to next doc
|
||||
final boolean next() throws IOException { // increments to next doc
|
||||
if (!tp.next()) {
|
||||
tp.close(); // close stream
|
||||
doc = Integer.MAX_VALUE; // sentinel value
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
doc = tp.doc();
|
||||
position = 0;
|
||||
return true;
|
||||
}
|
||||
|
||||
final boolean skipTo(int target) throws IOException {
|
||||
if (!tp.skipTo(target)) {
|
||||
tp.close(); // close stream
|
||||
doc = Integer.MAX_VALUE; // sentinel value
|
||||
return false;
|
||||
}
|
||||
doc = tp.doc();
|
||||
position = 0;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
final void firstPosition() throws IOException {
|
||||
count = tp.freq(); // read first pos
|
||||
nextPosition();
|
||||
|
|
|
@ -60,89 +60,127 @@ import org.apache.lucene.util.*;
|
|||
import org.apache.lucene.index.*;
|
||||
|
||||
abstract class PhraseScorer extends Scorer {
|
||||
private Weight weight;
|
||||
protected byte[] norms;
|
||||
protected float value;
|
||||
private Weight weight;
|
||||
protected byte[] norms;
|
||||
protected float value;
|
||||
|
||||
protected PhraseQueue pq;
|
||||
protected PhrasePositions first, last;
|
||||
private boolean firstTime = true;
|
||||
private boolean more = true;
|
||||
protected PhraseQueue pq;
|
||||
protected PhrasePositions first, last;
|
||||
|
||||
private float freq;
|
||||
private float freq;
|
||||
|
||||
PhraseScorer(Weight weight, TermPositions[] tps, Similarity similarity,
|
||||
byte[] norms) throws IOException {
|
||||
super(similarity);
|
||||
this.norms = norms;
|
||||
this.weight = weight;
|
||||
this.value = weight.getValue();
|
||||
PhraseScorer(Weight weight, TermPositions[] tps, Similarity similarity,
|
||||
byte[] norms) throws IOException {
|
||||
super(similarity);
|
||||
this.norms = norms;
|
||||
this.weight = weight;
|
||||
this.value = weight.getValue();
|
||||
|
||||
// use PQ to build a sorted list of PhrasePositions
|
||||
pq = new PhraseQueue(tps.length);
|
||||
for (int i = 0; i < tps.length; i++) {
|
||||
pq.put(new PhrasePositions(tps[i], i));
|
||||
}
|
||||
pqToList();
|
||||
// convert tps to a list
|
||||
for (int i = 0; i < tps.length; i++) {
|
||||
PhrasePositions pp = new PhrasePositions(tps[i], i);
|
||||
if (last != null) { // add next to end of list
|
||||
last.next = pp;
|
||||
} else
|
||||
first = pp;
|
||||
last = pp;
|
||||
}
|
||||
|
||||
public final void score(HitCollector results, int end) throws IOException {
|
||||
Similarity similarity = getSimilarity();
|
||||
while (last.doc < end) { // find doc w/ all the terms
|
||||
while (first.doc < last.doc) { // scan forward in first
|
||||
do {
|
||||
first.next();
|
||||
} while (first.doc < last.doc);
|
||||
firstToLast();
|
||||
if (last.doc >= end)
|
||||
return;
|
||||
}
|
||||
pq = new PhraseQueue(tps.length); // construct empty pq
|
||||
|
||||
// found doc with all terms
|
||||
freq = phraseFreq(); // check for phrase
|
||||
}
|
||||
|
||||
if (freq > 0.0) {
|
||||
float score = similarity.tf(freq) * value; // compute score
|
||||
score *= Similarity.decodeNorm(norms[first.doc]); // normalize
|
||||
results.collect(first.doc, score); // add to results
|
||||
}
|
||||
last.next(); // resume scanning
|
||||
}
|
||||
public int doc() { return first.doc; }
|
||||
|
||||
public boolean next() throws IOException {
|
||||
if (firstTime) {
|
||||
sort();
|
||||
firstTime = false;
|
||||
} else if (more) {
|
||||
more = last.next(); // trigger further scanning
|
||||
}
|
||||
|
||||
protected abstract float phraseFreq() throws IOException;
|
||||
while (more) {
|
||||
while (more && first.doc < last.doc) { // find doc w/ all the terms
|
||||
more = first.skipTo(last.doc); // skip first upto last
|
||||
firstToLast(); // and move it to the end
|
||||
}
|
||||
|
||||
protected final void pqToList() {
|
||||
last = first = null;
|
||||
while (pq.top() != null) {
|
||||
PhrasePositions pp = (PhrasePositions) pq.pop();
|
||||
if (last != null) { // add next to end of list
|
||||
last.next = pp;
|
||||
} else
|
||||
first = pp;
|
||||
last = pp;
|
||||
pp.next = null;
|
||||
}
|
||||
if (more) {
|
||||
// found a doc with all of the terms
|
||||
freq = phraseFreq(); // check for phrase
|
||||
if (freq == 0.0f) // no match
|
||||
more = last.next(); // trigger further scanning
|
||||
else
|
||||
return true; // found a match
|
||||
}
|
||||
}
|
||||
return false; // no more matches
|
||||
}
|
||||
|
||||
protected final void firstToLast() {
|
||||
last.next = first; // move first to end of list
|
||||
last = first;
|
||||
first = first.next;
|
||||
last.next = null;
|
||||
public float score() throws IOException {
|
||||
//System.out.println("scoring " + first.doc);
|
||||
float raw = getSimilarity().tf(freq) * value; // raw score
|
||||
return raw * Similarity.decodeNorm(norms[first.doc]); // normalize
|
||||
}
|
||||
|
||||
public boolean skipTo(int target) throws IOException {
|
||||
for (PhrasePositions pp = first; more && pp != null; pp = pp.next) {
|
||||
more = pp.skipTo(target);
|
||||
}
|
||||
if (more)
|
||||
sort(); // re-sort
|
||||
return more;
|
||||
}
|
||||
|
||||
public Explanation explain(final int doc) throws IOException {
|
||||
Explanation tfExplanation = new Explanation();
|
||||
|
||||
score(new HitCollector() {
|
||||
public final void collect(int d, float score) {
|
||||
}
|
||||
}, doc + 1);
|
||||
protected abstract float phraseFreq() throws IOException;
|
||||
|
||||
float phraseFreq = (first.doc == doc) ? freq : 0.0f;
|
||||
tfExplanation.setValue(getSimilarity().tf(phraseFreq));
|
||||
tfExplanation.setDescription("tf(phraseFreq=" + phraseFreq + ")");
|
||||
|
||||
return tfExplanation;
|
||||
private void sort() throws IOException {
|
||||
pq.clear();
|
||||
for (PhrasePositions pp = first; more && pp != null; pp = pp.next) {
|
||||
more = pp.next();
|
||||
if (more) {
|
||||
pq.put(pp);
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
}
|
||||
pqToList();
|
||||
}
|
||||
|
||||
protected final void pqToList() {
|
||||
last = first = null;
|
||||
while (pq.top() != null) {
|
||||
PhrasePositions pp = (PhrasePositions) pq.pop();
|
||||
if (last != null) { // add next to end of list
|
||||
last.next = pp;
|
||||
} else
|
||||
first = pp;
|
||||
last = pp;
|
||||
pp.next = null;
|
||||
}
|
||||
}
|
||||
|
||||
protected final void firstToLast() {
|
||||
last.next = first; // move first to end of list
|
||||
last = first;
|
||||
first = first.next;
|
||||
last.next = null;
|
||||
}
|
||||
|
||||
public Explanation explain(final int doc) throws IOException {
|
||||
Explanation tfExplanation = new Explanation();
|
||||
|
||||
while (next() && doc() < doc) {}
|
||||
|
||||
float phraseFreq = (doc() == doc) ? freq : 0.0f;
|
||||
tfExplanation.setValue(getSimilarity().tf(phraseFreq));
|
||||
tfExplanation.setDescription("tf(phraseFreq=" + phraseFreq + ")");
|
||||
|
||||
return tfExplanation;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -70,11 +70,39 @@ public abstract class Scorer {
|
|||
return this.similarity;
|
||||
}
|
||||
|
||||
/** Scores hits and passes them to a collector. Stops at the last document
|
||||
* before <code>maxDoc</code>. If called repeatedly, will restart at point
|
||||
* where it last left off.
|
||||
/** Scores all documents and passes them to a collector. */
|
||||
public void score(HitCollector hc) throws IOException {
|
||||
while (next()) {
|
||||
hc.collect(doc(), score());
|
||||
}
|
||||
}
|
||||
|
||||
/** Advance to the next document matching the query. Returns true iff there
|
||||
* is another match. */
|
||||
public abstract boolean next() throws IOException;
|
||||
|
||||
/** Returns the current document number. Initially invalid, until {@link
|
||||
* #next()} is called the first time. */
|
||||
public abstract int doc();
|
||||
|
||||
/** Returns the score of the current document. Initially invalid, until
|
||||
* {@link #next()} is called the first time. */
|
||||
public abstract float score() throws IOException;
|
||||
|
||||
/** Skips to the first match beyond the current whose document number is
|
||||
* greater than or equal to <i>target</i>. <p>Returns true iff there is such
|
||||
* a match. <p>Behaves as if written: <pre>
|
||||
* boolean skipTo(int target) {
|
||||
* do {
|
||||
* if (!next())
|
||||
* return false;
|
||||
* } while (target > doc());
|
||||
* return true;
|
||||
* }
|
||||
* </pre>
|
||||
* Most implementations are considerably more efficient than that.
|
||||
*/
|
||||
public abstract void score(HitCollector hc, int maxDoc) throws IOException;
|
||||
public abstract boolean skipTo(int target) throws IOException;
|
||||
|
||||
/** Returns an explanation of the score for <code>doc</code>. */
|
||||
public abstract Explanation explain(int doc) throws IOException;
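For illustration (not part of the commit): one way a caller can drive this new
pull-style contract, using skipTo() to test a sorted list of candidate documents
instead of walking every match with next().  The helper class is hypothetical.

import java.io.IOException;
import org.apache.lucene.search.HitCollector;
import org.apache.lucene.search.Scorer;

final class CandidateMatcher {

  /** Collects those candidates (sorted, distinct doc numbers) that the scorer matches. */
  static void collect(Scorer scorer, int[] candidates, HitCollector hc)
      throws IOException {
    int current = -1;                        // document the scorer is positioned on
    for (int i = 0; i < candidates.length; i++) {
      int target = candidates[i];
      if (current < target) {                // only skip when behind the target
        if (!scorer.skipTo(target))
          return;                            // no match at or after target: done
        current = scorer.doc();
      }
      if (current == target)                 // scorer landed exactly on the candidate
        hc.collect(current, scorer.score());
    }
  }
}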
|
||||
|
|
|
@ -83,44 +83,56 @@ final class TermScorer extends Scorer {
|
|||
|
||||
for (int i = 0; i < SCORE_CACHE_SIZE; i++)
|
||||
scoreCache[i] = getSimilarity().tf(i) * weightValue;
|
||||
|
||||
pointerMax = termDocs.read(docs, freqs); // fill buffers
|
||||
|
||||
if (pointerMax != 0)
|
||||
doc = docs[0];
|
||||
else {
|
||||
termDocs.close(); // close stream
|
||||
doc = Integer.MAX_VALUE; // set to sentinel value
|
||||
}
|
||||
}
|
||||
|
||||
public final void score(HitCollector c, final int end) throws IOException {
|
||||
int d = doc; // cache doc in local
|
||||
Similarity similarity = getSimilarity(); // cache sim in local
|
||||
while (d < end) { // for docs in window
|
||||
final int f = freqs[pointer];
|
||||
float score = // compute tf(f)*weight
|
||||
f < SCORE_CACHE_SIZE // check cache
|
||||
? scoreCache[f] // cache hit
|
||||
: similarity.tf(f)*weightValue; // cache miss
|
||||
public int doc() { return doc; }
|
||||
|
||||
score *= Similarity.decodeNorm(norms[d]); // normalize for field
|
||||
public boolean next() throws IOException {
|
||||
pointer++;
|
||||
if (pointer >= pointerMax) {
|
||||
pointerMax = termDocs.read(docs, freqs); // refill buffer
|
||||
if (pointerMax != 0) {
|
||||
pointer = 0;
|
||||
} else {
|
||||
termDocs.close(); // close stream
|
||||
doc = Integer.MAX_VALUE; // set to sentinel value
|
||||
return false;
|
||||
}
|
||||
}
|
||||
doc = docs[pointer];
|
||||
return true;
|
||||
}
|
||||
|
||||
c.collect(d, score); // collect score
|
||||
public float score() throws IOException {
|
||||
int f = freqs[pointer];
|
||||
float raw = // compute tf(f)*weight
|
||||
f < SCORE_CACHE_SIZE // check cache
|
||||
? scoreCache[f] // cache hit
|
||||
: getSimilarity().tf(f)*weightValue; // cache miss
|
||||
|
||||
if (++pointer == pointerMax) {
|
||||
pointerMax = termDocs.read(docs, freqs); // refill buffers
|
||||
if (pointerMax != 0) {
|
||||
pointer = 0;
|
||||
} else {
|
||||
termDocs.close(); // close stream
|
||||
doc = Integer.MAX_VALUE; // set to sentinel value
|
||||
return;
|
||||
}
|
||||
}
|
||||
d = docs[pointer];
|
||||
return raw * Similarity.decodeNorm(norms[doc]); // normalize for field
|
||||
}
|
||||
|
||||
public boolean skipTo(int target) throws IOException {
|
||||
// first scan in cache
|
||||
for (pointer++; pointer < pointerMax; pointer++) {
|
||||
if (!(target > docs[pointer])) {
|
||||
doc = docs[pointer];
|
||||
return true;
|
||||
}
|
||||
}
|
||||
doc = d; // flush cache
|
||||
|
||||
// not found in cache, seek underlying stream
|
||||
boolean result = termDocs.skipTo(target);
|
||||
if (result) {
|
||||
pointerMax = 1;
|
||||
pointer = 0;
|
||||
docs[pointer] = doc = termDocs.doc();
|
||||
freqs[pointer] = termDocs.freq();
|
||||
} else {
|
||||
doc = Integer.MAX_VALUE;
|
||||
}
|
||||
return result;
|
||||
}
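// For reference (not part of the diff): skipTo() above first scans the documents
// already buffered by read(docs, freqs); only when the target lies beyond the
// buffered entries does it delegate to the underlying TermDocs.skipTo(), after
// which it rebuilds a one-entry buffer from termDocs.doc() and termDocs.freq()
// so that score() and next() keep working unchanged.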
|
||||
|
||||
public Explanation explain(int doc) throws IOException {
|
||||
|
|
|
@ -226,98 +226,3 @@ public final class RAMDirectory extends Directory {
|
|||
public final void close() {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
final class RAMInputStream extends InputStream implements Cloneable {
|
||||
RAMFile file;
|
||||
int pointer = 0;
|
||||
|
||||
public RAMInputStream(RAMFile f) {
|
||||
file = f;
|
||||
length = file.length;
|
||||
}
|
||||
|
||||
/** InputStream methods */
|
||||
public final void readInternal(byte[] dest, int destOffset, int len) {
|
||||
int remainder = len;
|
||||
int start = pointer;
|
||||
while (remainder != 0) {
|
||||
int bufferNumber = start/InputStream.BUFFER_SIZE;
|
||||
int bufferOffset = start%InputStream.BUFFER_SIZE;
|
||||
int bytesInBuffer = InputStream.BUFFER_SIZE - bufferOffset;
|
||||
int bytesToCopy = bytesInBuffer >= remainder ? remainder : bytesInBuffer;
|
||||
byte[] buffer = (byte[])file.buffers.elementAt(bufferNumber);
|
||||
System.arraycopy(buffer, bufferOffset, dest, destOffset, bytesToCopy);
|
||||
destOffset += bytesToCopy;
|
||||
start += bytesToCopy;
|
||||
remainder -= bytesToCopy;
|
||||
}
|
||||
pointer += len;
|
||||
}
|
||||
|
||||
public final void close() {
|
||||
}
|
||||
|
||||
/** Random-access methods */
|
||||
public final void seekInternal(long pos) {
|
||||
pointer = (int)pos;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
final class RAMOutputStream extends OutputStream {
|
||||
RAMFile file;
|
||||
int pointer = 0;
|
||||
|
||||
public RAMOutputStream(RAMFile f) {
|
||||
file = f;
|
||||
}
|
||||
|
||||
/** output methods: */
|
||||
public final void flushBuffer(byte[] src, int len) {
|
||||
int bufferNumber = pointer/OutputStream.BUFFER_SIZE;
|
||||
int bufferOffset = pointer%OutputStream.BUFFER_SIZE;
|
||||
int bytesInBuffer = OutputStream.BUFFER_SIZE - bufferOffset;
|
||||
int bytesToCopy = bytesInBuffer >= len ? len : bytesInBuffer;
|
||||
|
||||
if (bufferNumber == file.buffers.size())
|
||||
file.buffers.addElement(new byte[OutputStream.BUFFER_SIZE]);
|
||||
|
||||
byte[] buffer = (byte[])file.buffers.elementAt(bufferNumber);
|
||||
System.arraycopy(src, 0, buffer, bufferOffset, bytesToCopy);
|
||||
|
||||
if (bytesToCopy < len) { // not all in one buffer
|
||||
int srcOffset = bytesToCopy;
|
||||
bytesToCopy = len - bytesToCopy; // remaining bytes
|
||||
bufferNumber++;
|
||||
if (bufferNumber == file.buffers.size())
|
||||
file.buffers.addElement(new byte[OutputStream.BUFFER_SIZE]);
|
||||
buffer = (byte[])file.buffers.elementAt(bufferNumber);
|
||||
System.arraycopy(src, srcOffset, buffer, 0, bytesToCopy);
|
||||
}
|
||||
pointer += len;
|
||||
if (pointer > file.length)
|
||||
file.length = pointer;
|
||||
|
||||
file.lastModified = System.currentTimeMillis();
|
||||
}
|
||||
|
||||
public final void close() throws IOException {
|
||||
super.close();
|
||||
}
|
||||
|
||||
/** Random-access methods */
|
||||
public final void seek(long pos) throws IOException {
|
||||
super.seek(pos);
|
||||
pointer = (int)pos;
|
||||
}
|
||||
public final long length() throws IOException {
|
||||
return file.length;
|
||||
}
|
||||
}
|
||||
|
||||
final class RAMFile {
|
||||
Vector buffers = new Vector();
|
||||
long length;
|
||||
long lastModified = System.currentTimeMillis();
|
||||
}
|
||||
|
|
|
@ -0,0 +1,63 @@
|
|||
package org.apache.lucene.store;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001, 2004 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.util.Vector;
|
||||
|
||||
class RAMFile {
|
||||
Vector buffers = new Vector();
|
||||
long length;
|
||||
long lastModified = System.currentTimeMillis();
|
||||
}
|
|
@ -0,0 +1,95 @@
|
|||
package org.apache.lucene.store;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001, 2004 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
/**
|
||||
* A memory-resident {@link InputStream} implementation.
|
||||
*
|
||||
* @version $Id$
|
||||
*/
|
||||
|
||||
class RAMInputStream extends InputStream implements Cloneable {
|
||||
private RAMFile file;
|
||||
private int pointer = 0;
|
||||
|
||||
public RAMInputStream(RAMFile f) {
|
||||
file = f;
|
||||
length = file.length;
|
||||
}
|
||||
|
||||
public void readInternal(byte[] dest, int destOffset, int len) {
|
||||
int remainder = len;
|
||||
int start = pointer;
|
||||
while (remainder != 0) {
|
||||
int bufferNumber = start/BUFFER_SIZE;
|
||||
int bufferOffset = start%BUFFER_SIZE;
|
||||
int bytesInBuffer = BUFFER_SIZE - bufferOffset;
|
||||
int bytesToCopy = bytesInBuffer >= remainder ? remainder : bytesInBuffer;
|
||||
byte[] buffer = (byte[])file.buffers.elementAt(bufferNumber);
|
||||
System.arraycopy(buffer, bufferOffset, dest, destOffset, bytesToCopy);
|
||||
destOffset += bytesToCopy;
|
||||
start += bytesToCopy;
|
||||
remainder -= bytesToCopy;
|
||||
}
|
||||
pointer += len;
|
||||
}
|
||||
|
||||
public void close() {
|
||||
}
|
||||
|
||||
public void seekInternal(long pos) {
|
||||
pointer = (int)pos;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,145 @@
|
|||
package org.apache.lucene.store;

/* ====================================================================
 * The Apache Software License, Version 1.1
 *
 * Copyright (c) 2001, 2004 The Apache Software Foundation.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:
 *       "This product includes software developed by the
 *        Apache Software Foundation (http://www.apache.org/)."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
 *    "Apache Lucene" must not be used to endorse or promote products
 *    derived from this software without prior written permission. For
 *    written permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
 *    "Apache Lucene", nor may "Apache" appear in their name, without
 *    prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 */

import java.io.IOException;

/**
 * A memory-resident {@link OutputStream} implementation.
 *
 * @version $Id$
 */

public class RAMOutputStream extends OutputStream {
  private RAMFile file;
  private int pointer = 0;

  /** Construct an empty output buffer. */
  public RAMOutputStream() {
    this(new RAMFile());
  }

  RAMOutputStream(RAMFile f) {
    file = f;
  }

  /** Copy the current contents of this buffer to the named output. */
  public void writeTo(OutputStream out) throws IOException {
    flush();
    final long end = file.length;
    long pos = 0;
    int buffer = 0;
    while (pos < end) {
      int length = BUFFER_SIZE;
      long nextPos = pos + length;
      if (nextPos > end) {                        // at the last buffer
        length = (int)(end - pos);
      }
      out.writeBytes((byte[])file.buffers.elementAt(buffer++), length);
      pos = nextPos;
    }
  }

  /** Resets this to an empty buffer. */
  public void reset() {
    try {
      seek(0);
    } catch (IOException e) {                     // should never happen
      throw new RuntimeException(e.toString());
    }

    file.length = 0;
  }

  public void flushBuffer(byte[] src, int len) {
    int bufferNumber = pointer/BUFFER_SIZE;
    int bufferOffset = pointer%BUFFER_SIZE;
    int bytesInBuffer = BUFFER_SIZE - bufferOffset;
    int bytesToCopy = bytesInBuffer >= len ? len : bytesInBuffer;

    if (bufferNumber == file.buffers.size())
      file.buffers.addElement(new byte[BUFFER_SIZE]);

    byte[] buffer = (byte[])file.buffers.elementAt(bufferNumber);
    System.arraycopy(src, 0, buffer, bufferOffset, bytesToCopy);

    if (bytesToCopy < len) {                      // not all in one buffer
      int srcOffset = bytesToCopy;
      bytesToCopy = len - bytesToCopy;            // remaining bytes
      bufferNumber++;
      if (bufferNumber == file.buffers.size())
        file.buffers.addElement(new byte[BUFFER_SIZE]);
      buffer = (byte[])file.buffers.elementAt(bufferNumber);
      System.arraycopy(src, srcOffset, buffer, 0, bytesToCopy);
    }
    pointer += len;
    if (pointer > file.length)
      file.length = pointer;

    file.lastModified = System.currentTimeMillis();
  }

  public void close() throws IOException {
    super.close();
  }

  public void seek(long pos) throws IOException {
    super.seek(pos);
    pointer = (int)pos;
  }
  public long length() {
    return file.length;
  }
}
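The writeTo()/reset() pair makes RAMOutputStream usable as a reusable staging buffer: bytes are accumulated in memory and then appended to a real output in one pass. A minimal sketch of that pattern follows; it is not part of the patch, the directory and file name are arbitrary, and the 1.4-era createFile API is assumed.

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.OutputStream;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.RAMOutputStream;

public class WriteToSketch {
  public static void main(String[] args) throws Exception {
    RAMOutputStream staging = new RAMOutputStream();
    Directory dir = new RAMDirectory();
    OutputStream out = dir.createFile("blocks.dat");

    for (int block = 0; block < 3; block++) {
      for (int i = 0; i < 16; i++)
        staging.writeVInt(block * 16 + i);        // accumulate one block in memory
      staging.writeTo(out);                       // copy everything buffered so far
      staging.reset();                            // start the next block from scratch
    }

    out.close();
    System.out.println("wrote " + dir.fileLength("blocks.dat") + " bytes");
  }
}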

@@ -54,6 +54,7 @@ package org.apache.lucene;
 * <http://www.apache.org/>.
 */

import org.apache.lucene.util.*;
import org.apache.lucene.store.*;
import org.apache.lucene.document.*;
import org.apache.lucene.analysis.*;

@@ -93,7 +94,7 @@ class ThreadSafetyTest {
      Document d = new Document();
      int n = RANDOM.nextInt();
      d.add(Field.Keyword("id", Integer.toString(n)));
      d.add(Field.UnStored("contents", intToEnglish(n)));
      d.add(Field.UnStored("contents", English.intToEnglish(n)));
      System.out.println("Adding " + n);

      // Switch between single and multiple file segments

@@ -151,7 +152,7 @@ class ThreadSafetyTest {
    throws Exception {
    System.out.println("Searching for " + n);
    Hits hits =
      searcher.search(QueryParser.parse(intToEnglish(n), "contents",
      searcher.search(QueryParser.parse(English.intToEnglish(n), "contents",
                                        ANALYZER));
    System.out.println("Search for " + n + ": total=" + hits.length());
    for (int j = 0; j < Math.min(3, hits.length()); j++) {

@@ -197,76 +198,4 @@ class ThreadSafetyTest {
    SearcherThread searcherThread3 = new SearcherThread(true);
    searcherThread3.start();
  }

  private static String intToEnglish(int i) {
    StringBuffer result = new StringBuffer();
    intToEnglish(i, result);
    return result.toString();
  }

  private static void intToEnglish(int i, StringBuffer result) {
    if (i < 0) {
      result.append("minus ");
      i = -i;
    }
    if (i >= 1000000000) {                        // billions
      intToEnglish(i/1000000000, result);
      result.append("billion, ");
      i = i%1000000000;
    }
    if (i >= 1000000) {                           // millions
      intToEnglish(i/1000000, result);
      result.append("million, ");
      i = i%1000000;
    }
    if (i >= 1000) {                              // thousands
      intToEnglish(i/1000, result);
      result.append("thousand, ");
      i = i%1000;
    }
    if (i >= 100) {                               // hundreds
      intToEnglish(i/100, result);
      result.append("hundred ");
      i = i%100;
    }
    if (i >= 20) {
      switch (i/10) {
      case 9 : result.append("ninety"); break;
      case 8 : result.append("eighty"); break;
      case 7 : result.append("seventy"); break;
      case 6 : result.append("sixty"); break;
      case 5 : result.append("fifty"); break;
      case 4 : result.append("forty"); break;
      case 3 : result.append("thirty"); break;
      case 2 : result.append("twenty"); break;
      }
      i = i%10;
      if (i == 0)
        result.append(" ");
      else
        result.append("-");
    }
    switch (i) {
    case 19 : result.append("nineteen "); break;
    case 18 : result.append("eighteen "); break;
    case 17 : result.append("seventeen "); break;
    case 16 : result.append("sixteen "); break;
    case 15 : result.append("fifteen "); break;
    case 14 : result.append("fourteen "); break;
    case 13 : result.append("thirteen "); break;
    case 12 : result.append("twelve "); break;
    case 11 : result.append("eleven "); break;
    case 10 : result.append("ten "); break;
    case 9 : result.append("nine "); break;
    case 8 : result.append("eight "); break;
    case 7 : result.append("seven "); break;
    case 6 : result.append("six "); break;
    case 5 : result.append("five "); break;
    case 4 : result.append("four "); break;
    case 3 : result.append("three "); break;
    case 2 : result.append("two "); break;
    case 1 : result.append("one "); break;
    case 0 : result.append(""); break;
    }
  }
}

@@ -0,0 +1,135 @@
package org.apache.lucene.search;

/* ====================================================================
 * The Apache Software License, Version 1.1
 *
 * Copyright (c) 2001, 2004 The Apache Software Foundation.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:
 *       "This product includes software developed by the
 *        Apache Software Foundation (http://www.apache.org/)."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
 *    "Apache Lucene" must not be used to endorse or promote products
 *    derived from this software without prior written permission. For
 *    written permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
 *    "Apache Lucene", nor may "Apache" appear in their name, without
 *    prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 */

import junit.framework.TestCase;
import org.apache.lucene.util.English;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.RAMDirectory;

/**
 * Tests basic search capabilities.
 *
 * @author Doug Cutting
 */
public class TestBasics extends TestCase {
  private IndexSearcher searcher;

  public void setUp() throws Exception {
    RAMDirectory directory = new RAMDirectory();
    IndexWriter writer
      = new IndexWriter(directory, new SimpleAnalyzer(), true);
    //writer.infoStream = System.out;
    StringBuffer buffer = new StringBuffer();
    for (int i = 0; i < 1000; i++) {
      Document doc = new Document();
      doc.add(Field.Text("field", English.intToEnglish(i)));
      writer.addDocument(doc);
    }

    writer.close();

    searcher = new IndexSearcher(directory);
  }

  public void testTerm() throws Exception {
    Query query = new TermQuery(new Term("field", "seventy"));
    Hits hits = searcher.search(query);
    assertEquals(100, hits.length());
  }

  public void testTerm2() throws Exception {
    Query query = new TermQuery(new Term("field", "seventish"));
    Hits hits = searcher.search(query);
    assertEquals(0, hits.length());
  }

  public void testPhrase() throws Exception {
    PhraseQuery query = new PhraseQuery();
    query.add(new Term("field", "seventy"));
    query.add(new Term("field", "seven"));
    Hits hits = searcher.search(query);
    assertEquals(10, hits.length());
  }

  public void testPhrase2() throws Exception {
    PhraseQuery query = new PhraseQuery();
    query.add(new Term("field", "seventish"));
    query.add(new Term("field", "sevenon"));
    Hits hits = searcher.search(query);
    assertEquals(0, hits.length());
  }

  public void testBoolean() throws Exception {
    BooleanQuery query = new BooleanQuery();
    query.add(new TermQuery(new Term("field", "seventy")), true, false);
    query.add(new TermQuery(new Term("field", "seven")), true, false);
    Hits hits = searcher.search(query);
    assertEquals(19, hits.length());
  }

  public void testBoolean2() throws Exception {
    BooleanQuery query = new BooleanQuery();
    query.add(new TermQuery(new Term("field", "sevento")), true, false);
    query.add(new TermQuery(new Term("field", "sevenly")), true, false);
    Hits hits = searcher.search(query);
    assertEquals(0, hits.length());
  }

}

@@ -0,0 +1,140 @@
package org.apache.lucene.util;

/* ====================================================================
 * The Apache Software License, Version 1.1
 *
 * Copyright (c) 2001, 2004 The Apache Software Foundation.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:
 *       "This product includes software developed by the
 *        Apache Software Foundation (http://www.apache.org/)."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
 *    "Apache Lucene" must not be used to endorse or promote products
 *    derived from this software without prior written permission. For
 *    written permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
 *    "Apache Lucene", nor may "Apache" appear in their name, without
 *    prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 */


public class English {

  public static String intToEnglish(int i) {
    StringBuffer result = new StringBuffer();
    intToEnglish(i, result);
    return result.toString();
  }

  public static void intToEnglish(int i, StringBuffer result) {
    if (i == 0) {
      result.append("zero");
      return;
    }
    if (i < 0) {
      result.append("minus ");
      i = -i;
    }
    if (i >= 1000000000) {                        // billions
      intToEnglish(i/1000000000, result);
      result.append("billion, ");
      i = i%1000000000;
    }
    if (i >= 1000000) {                           // millions
      intToEnglish(i/1000000, result);
      result.append("million, ");
      i = i%1000000;
    }
    if (i >= 1000) {                              // thousands
      intToEnglish(i/1000, result);
      result.append("thousand, ");
      i = i%1000;
    }
    if (i >= 100) {                               // hundreds
      intToEnglish(i/100, result);
      result.append("hundred ");
      i = i%100;
    }
    if (i >= 20) {
      switch (i/10) {
      case 9 : result.append("ninety"); break;
      case 8 : result.append("eighty"); break;
      case 7 : result.append("seventy"); break;
      case 6 : result.append("sixty"); break;
      case 5 : result.append("fifty"); break;
      case 4 : result.append("forty"); break;
      case 3 : result.append("thirty"); break;
      case 2 : result.append("twenty"); break;
      }
      i = i%10;
      if (i == 0)
        result.append(" ");
      else
        result.append("-");
    }
    switch (i) {
    case 19 : result.append("nineteen "); break;
    case 18 : result.append("eighteen "); break;
    case 17 : result.append("seventeen "); break;
    case 16 : result.append("sixteen "); break;
    case 15 : result.append("fifteen "); break;
    case 14 : result.append("fourteen "); break;
    case 13 : result.append("thirteen "); break;
    case 12 : result.append("twelve "); break;
    case 11 : result.append("eleven "); break;
    case 10 : result.append("ten "); break;
    case 9 : result.append("nine "); break;
    case 8 : result.append("eight "); break;
    case 7 : result.append("seven "); break;
    case 6 : result.append("six "); break;
    case 5 : result.append("five "); break;
    case 4 : result.append("four "); break;
    case 3 : result.append("three "); break;
    case 2 : result.append("two "); break;
    case 1 : result.append("one "); break;
    case 0 : result.append(""); break;
    }
  }

  public static void main(String[] args) {
    System.out.println(intToEnglish(Integer.parseInt(args[0])));
  }

}
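The output format has a few quirks worth noting: every word carries a trailing space, each power-of-ten group ends with a comma, and "zero" is only emitted for an input of exactly 0. A quick check follows, with the expected strings traced from the code above; the wrapping class name is arbitrary and not part of the patch.

public class EnglishFormatCheck {
  public static void main(String[] args) {
    // Brackets make the trailing spaces visible.
    System.out.println("[" + org.apache.lucene.util.English.intToEnglish(90) + "]");
    // prints: [ninety ]
    System.out.println("[" + org.apache.lucene.util.English.intToEnglish(7001) + "]");
    // prints: [seven thousand, one ]
  }
}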