diff --git a/CHANGES.txt b/CHANGES.txt index 47fa790b523..42be8b0d855 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -2,6 +2,36 @@ Lucene Change Log $Id$ +1.4 RC1 + + 1. Changed the format of the .tis file, so that: + + - it has a format version number, which makes it easier to + back-compatibly change file formats in the future. + + - the term count is now stored as a long. This was the one aspect + of Lucene's file formats which limited index size. + + - a few internal index parameters are now stored in the index, so + that they can (in theory) now be changed from index to index, + although there is not yet an API to do so. + + These changes are back compatible. The new code can read old + indexes. But old code will not be able to read new indexes. (cutting) + + 2. Added an optimized implementation of TermDocs.skipTo(). A skip + table is now stored for each term in the .frq file. This only + adds a percent or two to overall index size, but can substantially + speed up many searches. (cutting) + + 3. Restructured the Scorer API and all Scorer implementations to take + advantage of an optimized TermDocs.skipTo() implementation. In + particular, PhraseQuerys and conjunctive BooleanQuerys are + faster when one clause has substantially fewer matches than the + others. (A conjunctive BooleanQuery is a BooleanQuery where all + clauses are required.) (cutting) + + 1.3 final 1. 
Added catch of BooleanQuery$TooManyClauses in QueryParser to diff --git a/src/java/org/apache/lucene/index/DocumentWriter.java b/src/java/org/apache/lucene/index/DocumentWriter.java index eb4c822d6ca..bc9c1454b6b 100644 --- a/src/java/org/apache/lucene/index/DocumentWriter.java +++ b/src/java/org/apache/lucene/index/DocumentWriter.java @@ -291,7 +291,7 @@ final class DocumentWriter { Posting posting = postings[i]; // add an entry to the dictionary with pointers to prox and freq files - ti.set(1, freq.getFilePointer(), prox.getFilePointer()); + ti.set(1, freq.getFilePointer(), prox.getFilePointer(), -1); tis.add(posting.term, ti); // add an entry to the freq file diff --git a/src/java/org/apache/lucene/index/SegmentMerger.java b/src/java/org/apache/lucene/index/SegmentMerger.java index 76a42c7037c..08afc05260a 100644 --- a/src/java/org/apache/lucene/index/SegmentMerger.java +++ b/src/java/org/apache/lucene/index/SegmentMerger.java @@ -62,6 +62,7 @@ import java.io.IOException; import org.apache.lucene.store.Directory; import org.apache.lucene.store.OutputStream; import org.apache.lucene.store.InputStream; +import org.apache.lucene.store.RAMOutputStream; import org.apache.lucene.util.BitVector; final class SegmentMerger { @@ -246,17 +247,21 @@ final class SegmentMerger { int df = appendPostings(smis, n); // append posting data + long skipPointer = writeSkip(); + if (df > 0) { // add an entry to the dictionary with pointers to prox and freq files - termInfo.set(df, freqPointer, proxPointer); + termInfo.set(df, freqPointer, proxPointer, (int)(skipPointer-freqPointer)); termInfosWriter.add(smis[0].term, termInfo); } } private final int appendPostings(SegmentMergeInfo[] smis, int n) throws IOException { + final int skipInterval = termInfosWriter.skipInterval; int lastDoc = 0; int df = 0; // number of docs w/ term + resetSkip(); for (int i = 0; i < n; i++) { SegmentMergeInfo smi = smis[i]; TermPositions postings = smi.postings; @@ -272,6 +277,12 @@ final class 
SegmentMerger { if (doc < lastDoc) throw new IllegalStateException("docs out of order"); + df++; + + if ((df % skipInterval) == 0) { + bufferSkip(lastDoc); + } + int docCode = (doc - lastDoc) << 1; // use low bit to flag freq=1 lastDoc = doc; @@ -289,13 +300,43 @@ final class SegmentMerger { proxOutput.writeVInt(position - lastPosition); lastPosition = position; } - - df++; } } return df; } - private final void mergeNorms() throws IOException { + + private RAMOutputStream skipBuffer = new RAMOutputStream(); + private int lastSkipDoc; + private long lastSkipFreqPointer; + private long lastSkipProxPointer; + + private void resetSkip() throws IOException { + skipBuffer.reset(); + lastSkipDoc = 0; + lastSkipFreqPointer = freqOutput.getFilePointer(); + lastSkipProxPointer = proxOutput.getFilePointer(); + } + + private void bufferSkip(int doc) throws IOException { + long freqPointer = freqOutput.getFilePointer(); + long proxPointer = proxOutput.getFilePointer(); + + skipBuffer.writeVInt(doc - lastSkipDoc); + skipBuffer.writeVInt((int)(freqPointer - lastSkipFreqPointer)); + skipBuffer.writeVInt((int)(proxPointer - lastSkipProxPointer)); + + lastSkipDoc = doc; + lastSkipFreqPointer = freqPointer; + lastSkipProxPointer = proxPointer; + } + + private long writeSkip() throws IOException { + long skipPointer = freqOutput.getFilePointer(); + skipBuffer.writeTo(freqOutput); + return skipPointer; + } + + private void mergeNorms() throws IOException { for (int i = 0; i < fieldInfos.size(); i++) { FieldInfo fi = fieldInfos.fieldInfo(i); if (fi.isIndexed) { diff --git a/src/java/org/apache/lucene/index/SegmentTermDocs.java b/src/java/org/apache/lucene/index/SegmentTermDocs.java index cd7a9384033..44dffadc529 100644 --- a/src/java/org/apache/lucene/index/SegmentTermDocs.java +++ b/src/java/org/apache/lucene/index/SegmentTermDocs.java @@ -61,16 +61,27 @@ import org.apache.lucene.store.InputStream; class SegmentTermDocs implements TermDocs { protected SegmentReader parent; private 
InputStream freqStream; - private int freqCount; + private int count; + private int df; private BitVector deletedDocs; int doc = 0; int freq; + private int skipInterval; + private int skipCount; + private InputStream skipStream; + private int skipDoc; + private long freqPointer; + private long proxPointer; + private long skipPointer; + private boolean haveSkipped; + SegmentTermDocs(SegmentReader parent) throws IOException { this.parent = parent; this.freqStream = (InputStream)parent.freqStream.clone(); this.deletedDocs = parent.deletedDocs; + this.skipInterval = parent.tis.getSkipInterval(); } public void seek(Term term) throws IOException { @@ -88,12 +99,19 @@ class SegmentTermDocs implements TermDocs { } void seek(TermInfo ti) throws IOException { + count = 0; if (ti == null) { - freqCount = 0; + df = 0; } else { - freqCount = ti.docFreq; + df = ti.docFreq; doc = 0; - freqStream.seek(ti.freqPointer); + skipDoc = 0; + skipCount = 0; + freqPointer = ti.freqPointer; + proxPointer = ti.proxPointer; + skipPointer = freqPointer + ti.skipOffset; + freqStream.seek(freqPointer); + haveSkipped = false; } } @@ -109,7 +127,7 @@ class SegmentTermDocs implements TermDocs { public boolean next() throws IOException { while (true) { - if (freqCount == 0) + if (count == df) return false; int docCode = freqStream.readVInt(); @@ -119,7 +137,7 @@ class SegmentTermDocs implements TermDocs { else freq = freqStream.readVInt(); // else read freq - freqCount--; + count++; if (deletedDocs == null || !deletedDocs.get(doc)) break; @@ -131,9 +149,9 @@ class SegmentTermDocs implements TermDocs { /** Optimized implementation. 
*/ public int read(final int[] docs, final int[] freqs) throws IOException { - final int end = docs.length; + final int length = docs.length; int i = 0; - while (i < end && freqCount > 0) { + while (i < length && count < df) { // manually inlined call to next() for speed final int docCode = freqStream.readVInt(); @@ -142,7 +160,7 @@ class SegmentTermDocs implements TermDocs { freq = 1; // freq is one else freq = freqStream.readVInt(); // else read freq - freqCount--; + count++; if (deletedDocs == null || !deletedDocs.get(doc)) { docs[i] = doc; @@ -153,12 +171,61 @@ class SegmentTermDocs implements TermDocs { return i; } - /** As yet unoptimized implementation. */ + /** Overridden by SegmentTermPositions to skip in prox stream. */ + protected void skipProx(long proxPointer) throws IOException {} + + /** Optimized implementation. */ public boolean skipTo(int target) throws IOException { + if (df > skipInterval) { // optimized case + + if (skipStream == null) + skipStream = (InputStream)freqStream.clone(); // lazily clone + + if (!haveSkipped) { // lazily seek skip stream + skipStream.seek(skipPointer); + haveSkipped = true; + } + + // scan skip data + int lastSkipDoc = skipDoc; + long lastFreqPointer = freqStream.getFilePointer(); + long lastProxPointer = -1; + int numSkipped = -1 -(count % skipInterval); + + while (target > skipDoc) { + lastSkipDoc = skipDoc; + lastFreqPointer = freqPointer; + lastProxPointer = proxPointer; + if (skipDoc >= doc) + numSkipped += skipInterval; + + if ((count + numSkipped + skipInterval) > df) + break; // no more skips + + skipDoc += skipStream.readVInt(); + freqPointer += skipStream.readVInt(); + proxPointer += skipStream.readVInt(); + + skipCount++; + } + + // if we found something to skip, then skip it + if (lastFreqPointer > freqStream.getFilePointer()) { + freqStream.seek(lastFreqPointer); + skipProx(lastProxPointer); + + doc = lastSkipDoc; + count += numSkipped; + } + + } + + // done skipping, now just scan do { if (!next()) 
return false; } while (target > doc); return true; } + } diff --git a/src/java/org/apache/lucene/index/SegmentTermEnum.java b/src/java/org/apache/lucene/index/SegmentTermEnum.java index ef8e234a9fb..17262832516 100644 --- a/src/java/org/apache/lucene/index/SegmentTermEnum.java +++ b/src/java/org/apache/lucene/index/SegmentTermEnum.java @@ -60,14 +60,17 @@ import org.apache.lucene.store.InputStream; final class SegmentTermEnum extends TermEnum implements Cloneable { private InputStream input; private FieldInfos fieldInfos; - int size; - int position = -1; + long size; + long position = -1; private Term term = new Term("", ""); private TermInfo termInfo = new TermInfo(); - boolean isIndex = false; + private int format; + private boolean isIndex = false; long indexPointer = 0; + int indexInterval; + int skipInterval; Term prev; private char[] buffer = {}; @@ -76,8 +79,34 @@ final class SegmentTermEnum extends TermEnum implements Cloneable { throws IOException { input = i; fieldInfos = fis; - size = input.readInt(); isIndex = isi; + + int firstInt = input.readInt(); + if (firstInt >= 0) { + // original-format file, without explicit format version number + format = 0; + size = firstInt; + + // back-compatible settings + indexInterval = 128; + skipInterval = Integer.MAX_VALUE; + + } else { + // we have a format version number + format = firstInt; + + // check that it is a format we can understand + if (format < TermInfosWriter.FORMAT) + throw new IOException("Unknown format version:" + format); + + size = input.readLong(); // read the size + + if (!isIndex) { + indexInterval = input.readInt(); + skipInterval = input.readInt(); + } + } + } protected Object clone() { @@ -117,6 +146,12 @@ final class SegmentTermEnum extends TermEnum implements Cloneable { termInfo.freqPointer += input.readVLong(); // read freq pointer termInfo.proxPointer += input.readVLong(); // read prox pointer + if (!isIndex) { + if (termInfo.docFreq > skipInterval) { + termInfo.skipOffset = 
input.readVInt(); + } + } + if (isIndex) indexPointer += input.readVLong(); // read index pointer diff --git a/src/java/org/apache/lucene/index/SegmentTermPositions.java b/src/java/org/apache/lucene/index/SegmentTermPositions.java index 5fc487ccbb8..6b148a4e0ba 100644 --- a/src/java/org/apache/lucene/index/SegmentTermPositions.java +++ b/src/java/org/apache/lucene/index/SegmentTermPositions.java @@ -109,4 +109,11 @@ extends SegmentTermDocs implements TermPositions { throw new UnsupportedOperationException(); } + + /** Called by super.skipTo(). */ + protected void skipProx(long proxPointer) throws IOException { + proxStream.seek(proxPointer); + proxCount = 0; + } + } diff --git a/src/java/org/apache/lucene/index/TermInfo.java b/src/java/org/apache/lucene/index/TermInfo.java index 91c974aa6d4..f13a9b7c6e1 100644 --- a/src/java/org/apache/lucene/index/TermInfo.java +++ b/src/java/org/apache/lucene/index/TermInfo.java @@ -62,6 +62,7 @@ final class TermInfo { long freqPointer = 0; long proxPointer = 0; + int skipOffset; TermInfo() {} @@ -75,17 +76,21 @@ final class TermInfo { docFreq = ti.docFreq; freqPointer = ti.freqPointer; proxPointer = ti.proxPointer; + skipOffset = ti.skipOffset; } - final void set(int df, long fp, long pp) { - docFreq = df; - freqPointer = fp; - proxPointer = pp; + final void set(int docFreq, + long freqPointer, long proxPointer, int skipOffset) { + this.docFreq = docFreq; + this.freqPointer = freqPointer; + this.proxPointer = proxPointer; + this.skipOffset = skipOffset; } final void set(TermInfo ti) { docFreq = ti.docFreq; freqPointer = ti.freqPointer; proxPointer = ti.proxPointer; + skipOffset = ti.skipOffset; } } diff --git a/src/java/org/apache/lucene/index/TermInfosReader.java b/src/java/org/apache/lucene/index/TermInfosReader.java index c544b619e48..5393d55974e 100644 --- a/src/java/org/apache/lucene/index/TermInfosReader.java +++ b/src/java/org/apache/lucene/index/TermInfosReader.java @@ -68,7 +68,7 @@ final class TermInfosReader { private 
FieldInfos fieldInfos; private SegmentTermEnum enumerator; - private int size; + private long size; TermInfosReader(Directory dir, String seg, FieldInfos fis) throws IOException { @@ -82,13 +82,17 @@ final class TermInfosReader { readIndex(); } + public int getSkipInterval() { + return enumerator.skipInterval; + } + final void close() throws IOException { if (enumerator != null) enumerator.close(); } /** Returns the number of term/value pairs in the set. */ - final int size() { + final long size() { return size; } @@ -101,7 +105,7 @@ final class TermInfosReader { new SegmentTermEnum(directory.openFile(segment + ".tii"), fieldInfos, true); try { - int indexSize = indexEnum.size; + int indexSize = (int)indexEnum.size; indexTerms = new Term[indexSize]; indexInfos = new TermInfo[indexSize]; @@ -137,7 +141,7 @@ final class TermInfosReader { private final void seekEnum(int indexOffset) throws IOException { enumerator.seek(indexPointers[indexOffset], - (indexOffset * TermInfosWriter.INDEX_INTERVAL) - 1, + (indexOffset * enumerator.indexInterval) - 1, indexTerms[indexOffset], indexInfos[indexOffset]); } @@ -146,10 +150,10 @@ final class TermInfosReader { if (size == 0) return null; // optimize sequential access: first try scanning cached enumerator w/o seeking - if (enumerator.term() != null // term is at or past current + if (enumerator.term() != null // term is at or past current && ((enumerator.prev != null && term.compareTo(enumerator.prev) > 0) || term.compareTo(enumerator.term()) >= 0)) { - int enumOffset = (enumerator.position/TermInfosWriter.INDEX_INTERVAL)+1; + int enumOffset = (int)(enumerator.position/enumerator.indexInterval)+1; if (indexTerms.length == enumOffset // but before end of block || term.compareTo(indexTerms[enumOffset]) < 0) return scanEnum(term); // no need to seek @@ -174,10 +178,10 @@ final class TermInfosReader { if (size == 0) return null; if (enumerator != null && enumerator.term() != null && position >= enumerator.position && - position < 
(enumerator.position + TermInfosWriter.INDEX_INTERVAL)) + position < (enumerator.position + enumerator.indexInterval)) return scanEnum(position); // can avoid seek - seekEnum(position / TermInfosWriter.INDEX_INTERVAL); // must seek + seekEnum(position / enumerator.indexInterval); // must seek return scanEnum(position); } @@ -190,7 +194,7 @@ final class TermInfosReader { } /** Returns the position of a Term in the set or -1. */ - final synchronized int getPosition(Term term) throws IOException { + final synchronized long getPosition(Term term) throws IOException { if (size == 0) return -1; int indexOffset = getIndexOffset(term); diff --git a/src/java/org/apache/lucene/index/TermInfosWriter.java b/src/java/org/apache/lucene/index/TermInfosWriter.java index a8a79f769e4..684ec9d055c 100644 --- a/src/java/org/apache/lucene/index/TermInfosWriter.java +++ b/src/java/org/apache/lucene/index/TermInfosWriter.java @@ -62,13 +62,36 @@ import org.apache.lucene.store.Directory; Directory. A TermInfos can be written once, in order. */ final class TermInfosWriter { + /** The file format version, a negative number. */ + public static final int FORMAT = -1; + private FieldInfos fieldInfos; private OutputStream output; private Term lastTerm = new Term("", ""); private TermInfo lastTi = new TermInfo(); private int size = 0; - static final int INDEX_INTERVAL = 128; + // TODO: the default values for these two parameters should be settable from + // IndexWriter. However, once that's done, folks will start setting them to + // ridiculous values and complaining that things don't work well, as with + // mergeFactor. So, let's wait until a number of folks find that alternate + // values work better. Note that both of these values are stored in the + // segment, so that it's safe to change these w/o rebuilding all indexes. + + /** Expert: The fraction of terms in the "dictionary" which should be stored + * in RAM. 
Smaller values use more memory, but make searching slightly + * faster, while larger values use less memory and make searching slightly + * slower. Searching is typically not dominated by dictionary lookup, so + * tweaking this is rarely useful.*/ + int indexInterval = 128; + + /** Expert: The fraction of {@link TermDocs} entries stored in skip tables, + * used to accelerate {@link TermDocs#skipTo(int)}. Larger values result in + * smaller indexes, greater acceleration, but fewer accelerable cases, while + * smaller values result in bigger indexes, less acceleration and more + * accelerable cases. More detailed experiments would be useful here. */ + int skipInterval = 16; + private long lastIndexPointer = 0; private boolean isIndex = false; @@ -91,7 +114,12 @@ final class TermInfosWriter { fieldInfos = fis; isIndex = isi; output = directory.createFile(segment + (isIndex ? ".tii" : ".tis")); - output.writeInt(0); // leave space for size + output.writeInt(FORMAT); // write format + output.writeLong(0); // leave space for size + if (!isIndex) { + output.writeInt(indexInterval); // write indexInterval + output.writeInt(skipInterval); // write skipInterval + } } /** Adds a new pair to the set. 
@@ -106,7 +134,7 @@ final class TermInfosWriter { if (ti.proxPointer < lastTi.proxPointer) throw new IOException("proxPointer out of order"); - if (!isIndex && size % INDEX_INTERVAL == 0) + if (!isIndex && size % indexInterval == 0) other.add(lastTerm, lastTi); // add an index term writeTerm(term); // write term @@ -114,6 +142,12 @@ final class TermInfosWriter { output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers output.writeVLong(ti.proxPointer - lastTi.proxPointer); + if (!isIndex) { + if (ti.docFreq > skipInterval) { + output.writeVInt(ti.skipOffset); + } + } + if (isIndex) { output.writeVLong(other.output.getFilePointer() - lastIndexPointer); lastIndexPointer = other.output.getFilePointer(); // write pointer @@ -149,8 +183,8 @@ final class TermInfosWriter { /** Called to complete TermInfos creation. */ final void close() throws IOException { - output.seek(0); // write size at start - output.writeInt(size); + output.seek(4); // write size after format + output.writeLong(size); output.close(); if (!isIndex) diff --git a/src/java/org/apache/lucene/search/BooleanQuery.java b/src/java/org/apache/lucene/search/BooleanQuery.java index 66fa15d0d39..8cae4e632ff 100644 --- a/src/java/org/apache/lucene/search/BooleanQuery.java +++ b/src/java/org/apache/lucene/search/BooleanQuery.java @@ -158,6 +158,37 @@ public class BooleanQuery extends Query { } public Scorer scorer(IndexReader reader) throws IOException { + // First see if the (faster) ConjunctionScorer will work. This can be + // used when all clauses are required. Also, at this point a + // BooleanScorer cannot be embedded in a ConjunctionScorer, as the hits + // from a BooleanScorer are not always sorted by document number (sigh) + // and hence BooleanScorer cannot implement skipTo() correctly, which is + // required by ConjunctionScorer. 
+ boolean allRequired = true; + boolean noneBoolean = true; + for (int i = 0 ; i < weights.size(); i++) { + BooleanClause c = (BooleanClause)clauses.elementAt(i); + if (!c.required) + allRequired = false; + if (c.query instanceof BooleanQuery) + noneBoolean = false; + } + + if (allRequired && noneBoolean) { // ConjunctionScorer is okay + ConjunctionScorer result = + new ConjunctionScorer(searcher.getSimilarity()); + for (int i = 0 ; i < weights.size(); i++) { + BooleanClause c = (BooleanClause)clauses.elementAt(i); + Weight w = (Weight)weights.elementAt(i); + Scorer subScorer = w.scorer(reader); + if (subScorer == null) + return null; + result.add(subScorer); + } + return result; + } + + // Use good-old BooleanScorer instead. BooleanScorer result = new BooleanScorer(searcher.getSimilarity()); for (int i = 0 ; i < weights.size(); i++) { diff --git a/src/java/org/apache/lucene/search/BooleanScorer.java b/src/java/org/apache/lucene/search/BooleanScorer.java index 8d2bf0eae84..d822780d358 100644 --- a/src/java/org/apache/lucene/search/BooleanScorer.java +++ b/src/java/org/apache/lucene/search/BooleanScorer.java @@ -76,14 +76,17 @@ final class BooleanScorer extends Scorer { static final class SubScorer { public Scorer scorer; + public boolean done; public boolean required = false; public boolean prohibited = false; public HitCollector collector; public SubScorer next; public SubScorer(Scorer scorer, boolean required, boolean prohibited, - HitCollector collector, SubScorer next) { + HitCollector collector, SubScorer next) + throws IOException { this.scorer = scorer; + this.done = !scorer.next(); this.required = required; this.prohibited = prohibited; this.collector = collector; @@ -91,7 +94,8 @@ final class BooleanScorer extends Scorer { } } - final void add(Scorer scorer, boolean required, boolean prohibited) { + final void add(Scorer scorer, boolean required, boolean prohibited) + throws IOException { int mask = 0; if (required || prohibited) { if (nextMask == 0) @@ 
-120,17 +124,45 @@ final class BooleanScorer extends Scorer { coordFactors[i] = getSimilarity().coord(i, maxCoord-1); } - public final void score(HitCollector results, int maxDoc) - throws IOException { + private int end; + private Bucket current; + + public int doc() { return current.doc; } + + public boolean next() throws IOException { + boolean more = false; + do { + while (bucketTable.first != null) { // more queued + current = bucketTable.first; + bucketTable.first = current.next; // pop the queue + + // check prohibited & required + if ((current.bits & prohibitedMask) == 0 && + (current.bits & requiredMask) == requiredMask) { + return true; + } + } + + // refill the queue + end += BucketTable.SIZE; + for (SubScorer sub = scorers; sub != null; sub = sub.next) { + Scorer scorer = sub.scorer; + while (!sub.done && scorer.doc() < end) { + sub.collector.collect(scorer.doc(), scorer.score()); + sub.done = !scorer.next(); + } + if (!sub.done) { + more = true; + } + } + } while (bucketTable.first != null | more); + return false; + } + + public float score() throws IOException { if (coordFactors == null) computeCoordFactors(); - - while (currentDoc < maxDoc) { - currentDoc = Math.min(currentDoc+BucketTable.SIZE, maxDoc); - for (SubScorer t = scorers; t != null; t = t.next) - t.scorer.score(t.collector, currentDoc); - bucketTable.collectHits(results); - } + return current.score * coordFactors[current.coord]; } static final class Bucket { @@ -196,7 +228,7 @@ final class BooleanScorer extends Scorer { bucket.score = score; // initialize score bucket.bits = mask; // initialize mask bucket.coord = 1; // initialize coord - + bucket.next = table.first; // push onto valid list table.first = bucket; } else { // valid bucket @@ -207,6 +239,10 @@ final class BooleanScorer extends Scorer { } } + public boolean skipTo(int target) throws IOException { + throw new UnsupportedOperationException(); + } + public Explanation explain(int doc) throws IOException { throw new 
UnsupportedOperationException(); } diff --git a/src/java/org/apache/lucene/search/ConjunctionScorer.java b/src/java/org/apache/lucene/search/ConjunctionScorer.java new file mode 100644 index 00000000000..57faf0982f0 --- /dev/null +++ b/src/java/org/apache/lucene/search/ConjunctionScorer.java @@ -0,0 +1,155 @@ +package org.apache.lucene.search; + +/* ==================================================================== + * The Apache Software License, Version 1.1 + * + * Copyright (c) 2004 The Apache Software Foundation. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. The end-user documentation included with the redistribution, + * if any, must include the following acknowledgment: + * "This product includes software developed by the + * Apache Software Foundation (http://www.apache.org/)." + * Alternately, this acknowledgment may appear in the software itself, + * if and wherever such third-party acknowledgments normally appear. + * + * 4. The names "Apache" and "Apache Software Foundation" and + * "Apache Lucene" must not be used to endorse or promote products + * derived from this software without prior written permission. For + * written permission, please contact apache@apache.org. + * + * 5. Products derived from this software may not be called "Apache", + * "Apache Lucene", nor may "Apache" appear in their name, without + * prior written permission of the Apache Software Foundation. 
+ * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * <http://www.apache.org/>. + */ + +import java.io.IOException; +import java.util.*; +import org.apache.lucene.index.*; + +/** Scorer for conjunctions, sets of queries, all of which are required. 
*/ +final class ConjunctionScorer extends Scorer { + private LinkedList scorers = new LinkedList(); + private boolean firstTime = true; + private boolean more = true; + private float coord; + + public ConjunctionScorer(Similarity similarity) { + super(similarity); + } + + final void add(Scorer scorer) throws IOException { + scorers.addLast(scorer); + } + + private Scorer first() { return (Scorer)scorers.getFirst(); } + private Scorer last() { return (Scorer)scorers.getLast(); } + + public int doc() { return first().doc(); } + + public boolean next() throws IOException { + if (firstTime) { + init(); + } else if (more) { + more = last().next(); // trigger further scanning + } + + while (more && first().doc() < last().doc()) { // find doc w/ all clauses + more = first().skipTo(last().doc()); // skip first upto last + scorers.addLast(scorers.removeFirst()); // move first to last + } + + return more; // found a doc with all clauses + } + + public boolean skipTo(int target) throws IOException { + Iterator i = scorers.iterator(); + while (more && i.hasNext()) { + more = ((Scorer)i.next()).skipTo(target); + } + if (more) + sortScorers(); // re-sort scorers + return more; + } + + public float score() throws IOException { + float score = 0.0f; // sum scores + Iterator i = scorers.iterator(); + while (i.hasNext()) + score += ((Scorer)i.next()).score(); + score *= coord; + return score; + } + + private void init() throws IOException { + more = scorers.size() > 0; + + // compute coord factor + coord = getSimilarity().coord(scorers.size(), scorers.size()); + + // move each scorer to its first entry + Iterator i = scorers.iterator(); + while (more && i.hasNext()) { + more = ((Scorer)i.next()).next(); + } + if (more) + sortScorers(); // initial sort of list + + firstTime = false; + } + + private void sortScorers() throws IOException { + // move scorers to an array + Scorer[] array = (Scorer[])scorers.toArray(new Scorer[scorers.size()]); + scorers.clear(); // empty the list + + 
Arrays.sort(array, new Comparator() { // sort the array + public int compare(Object o1, Object o2) { + return ((Scorer)o1).doc() - ((Scorer)o2).doc(); + } + public boolean equals(Object o1, Object o2) { + return ((Scorer)o1).doc() == ((Scorer)o2).doc(); + } + }); + + for (int i = 0; i < array.length; i++) { + scorers.addLast(array[i]); // re-build list, now sorted + } + } + + public Explanation explain(int doc) throws IOException { + throw new UnsupportedOperationException(); + } + +} diff --git a/src/java/org/apache/lucene/search/IndexSearcher.java b/src/java/org/apache/lucene/search/IndexSearcher.java index b03094ca56e..f152bfe3114 100644 --- a/src/java/org/apache/lucene/search/IndexSearcher.java +++ b/src/java/org/apache/lucene/search/IndexSearcher.java @@ -140,7 +140,7 @@ public class IndexSearcher extends Searcher { hq.insert(new ScoreDoc(doc, score)); } } - }, reader.maxDoc()); + }); ScoreDoc[] scoreDocs = new ScoreDoc[hq.size()]; for (int i = hq.size()-1; i >= 0; i--) // put docs in array @@ -180,7 +180,7 @@ public class IndexSearcher extends Searcher { Scorer scorer = query.weight(this).scorer(reader); if (scorer == null) return; - scorer.score(collector, reader.maxDoc()); + scorer.score(collector); } public Query rewrite(Query original) throws IOException { diff --git a/src/java/org/apache/lucene/search/PhrasePositions.java b/src/java/org/apache/lucene/search/PhrasePositions.java index adfb59e63b2..41c8b961da1 100644 --- a/src/java/org/apache/lucene/search/PhrasePositions.java +++ b/src/java/org/apache/lucene/search/PhrasePositions.java @@ -68,19 +68,31 @@ final class PhrasePositions { PhrasePositions(TermPositions t, int o) throws IOException { tp = t; offset = o; - next(); } - final void next() throws IOException { // increments to next doc + final boolean next() throws IOException { // increments to next doc if (!tp.next()) { tp.close(); // close stream doc = Integer.MAX_VALUE; // sentinel value - return; + return false; } doc = tp.doc(); position = 0; 
+ return true; } + final boolean skipTo(int target) throws IOException { + if (!tp.skipTo(target)) { + tp.close(); // close stream + doc = Integer.MAX_VALUE; // sentinel value + return false; + } + doc = tp.doc(); + position = 0; + return true; + } + + final void firstPosition() throws IOException { count = tp.freq(); // read first pos nextPosition(); diff --git a/src/java/org/apache/lucene/search/PhraseScorer.java b/src/java/org/apache/lucene/search/PhraseScorer.java index 00ae443b970..82b152d6b8c 100644 --- a/src/java/org/apache/lucene/search/PhraseScorer.java +++ b/src/java/org/apache/lucene/search/PhraseScorer.java @@ -60,89 +60,127 @@ import org.apache.lucene.util.*; import org.apache.lucene.index.*; abstract class PhraseScorer extends Scorer { - private Weight weight; - protected byte[] norms; - protected float value; + private Weight weight; + protected byte[] norms; + protected float value; - protected PhraseQueue pq; - protected PhrasePositions first, last; + private boolean firstTime = true; + private boolean more = true; + protected PhraseQueue pq; + protected PhrasePositions first, last; - private float freq; + private float freq; - PhraseScorer(Weight weight, TermPositions[] tps, Similarity similarity, - byte[] norms) throws IOException { - super(similarity); - this.norms = norms; - this.weight = weight; - this.value = weight.getValue(); + PhraseScorer(Weight weight, TermPositions[] tps, Similarity similarity, + byte[] norms) throws IOException { + super(similarity); + this.norms = norms; + this.weight = weight; + this.value = weight.getValue(); - // use PQ to build a sorted list of PhrasePositions - pq = new PhraseQueue(tps.length); - for (int i = 0; i < tps.length; i++) { - pq.put(new PhrasePositions(tps[i], i)); - } - pqToList(); + // convert tps to a list + for (int i = 0; i < tps.length; i++) { + PhrasePositions pp = new PhrasePositions(tps[i], i); + if (last != null) { // add next to end of list + last.next = pp; + } else + first = pp; + last = 
pp; } - public final void score(HitCollector results, int end) throws IOException { - Similarity similarity = getSimilarity(); - while (last.doc < end) { // find doc w/ all the terms - while (first.doc < last.doc) { // scan forward in first - do { - first.next(); - } while (first.doc < last.doc); - firstToLast(); - if (last.doc >= end) - return; - } + pq = new PhraseQueue(tps.length); // construct empty pq - // found doc with all terms - freq = phraseFreq(); // check for phrase + } - if (freq > 0.0) { - float score = similarity.tf(freq) * value; // compute score - score *= Similarity.decodeNorm(norms[first.doc]); // normalize - results.collect(first.doc, score); // add to results - } - last.next(); // resume scanning - } + public int doc() { return first.doc; } + + public boolean next() throws IOException { + if (firstTime) { + sort(); + firstTime = false; + } else if (more) { + more = last.next(); // trigger further scanning } - protected abstract float phraseFreq() throws IOException; + while (more) { + while (more && first.doc < last.doc) { // find doc w/ all the terms + more = first.skipTo(last.doc); // skip first upto last + firstToLast(); // and move it to the end + } - protected final void pqToList() { - last = first = null; - while (pq.top() != null) { - PhrasePositions pp = (PhrasePositions) pq.pop(); - if (last != null) { // add next to end of list - last.next = pp; - } else - first = pp; - last = pp; - pp.next = null; - } + if (more) { + // found a doc with all of the terms + freq = phraseFreq(); // check for phrase + if (freq == 0.0f) // no match + more = last.next(); // trigger further scanning + else + return true; // found a match + } } + return false; // no more matches + } - protected final void firstToLast() { - last.next = first; // move first to end of list - last = first; - first = first.next; - last.next = null; + public float score() throws IOException { + //System.out.println("scoring " + first.doc); + float raw = getSimilarity().tf(freq) * 
value; // raw score + return raw * Similarity.decodeNorm(norms[first.doc]); // normalize + } + + public boolean skipTo(int target) throws IOException { + for (PhrasePositions pp = first; more && pp != null; pp = pp.next) { + more = pp.skipTo(target); } + if (more) + sort(); // re-sort + return more; + } - public Explanation explain(final int doc) throws IOException { - Explanation tfExplanation = new Explanation(); - score(new HitCollector() { - public final void collect(int d, float score) { - } - }, doc + 1); + protected abstract float phraseFreq() throws IOException; - float phraseFreq = (first.doc == doc) ? freq : 0.0f; - tfExplanation.setValue(getSimilarity().tf(phraseFreq)); - tfExplanation.setDescription("tf(phraseFreq=" + phraseFreq + ")"); - - return tfExplanation; + private void sort() throws IOException { + pq.clear(); + for (PhrasePositions pp = first; more && pp != null; pp = pp.next) { + more = pp.next(); + if (more) { + pq.put(pp); + } else { + return; + } } + pqToList(); + } + + protected final void pqToList() { + last = first = null; + while (pq.top() != null) { + PhrasePositions pp = (PhrasePositions) pq.pop(); + if (last != null) { // add next to end of list + last.next = pp; + } else + first = pp; + last = pp; + pp.next = null; + } + } + + protected final void firstToLast() { + last.next = first; // move first to end of list + last = first; + first = first.next; + last.next = null; + } + + public Explanation explain(final int doc) throws IOException { + Explanation tfExplanation = new Explanation(); + + while (next() && doc() < doc) {} + + float phraseFreq = (doc() == doc) ? 
freq : 0.0f; + tfExplanation.setValue(getSimilarity().tf(phraseFreq)); + tfExplanation.setDescription("tf(phraseFreq=" + phraseFreq + ")"); + + return tfExplanation; + } } diff --git a/src/java/org/apache/lucene/search/Scorer.java b/src/java/org/apache/lucene/search/Scorer.java index 68d03a41de0..0117f91e2b9 100644 --- a/src/java/org/apache/lucene/search/Scorer.java +++ b/src/java/org/apache/lucene/search/Scorer.java @@ -70,11 +70,39 @@ public abstract class Scorer { return this.similarity; } - /** Scores hits and passes them to a collector. Stops at the last document - * before maxDoc. If called repeatedly, will restart at point - * where it last left off. + /** Scores all documents and passes them to a collector. */ + public void score(HitCollector hc) throws IOException { + while (next()) { + hc.collect(doc(), score()); + } + } + + /** Advance to the next document matching the query. Returns true iff there + * is another match. */ + public abstract boolean next() throws IOException; + + /** Returns the current document number. Initially invalid, until {@link + * #next()} is called the first time. */ + public abstract int doc(); + + /** Returns the score of the current document. Initially invalid, until + * {@link #next()} is called the first time. */ + public abstract float score() throws IOException; + + /** Skips to the first match beyond the current whose document number is + * greater than or equal to target.

 <p>Returns true iff there is such + * a match.

 <p>Behaves as if written: <pre>
+   *   boolean skipTo(int target) {
+   *     do {
+   *       if (!next())
+   *         return false;
+   *     } while (target > doc());
+   *     return true;
+   *   }
+   * </pre>
+ * Most implementations are considerably more efficient than that. */ - public abstract void score(HitCollector hc, int maxDoc) throws IOException; + public abstract boolean skipTo(int target) throws IOException; /** Returns an explanation of the score for doc. */ public abstract Explanation explain(int doc) throws IOException; diff --git a/src/java/org/apache/lucene/search/TermScorer.java b/src/java/org/apache/lucene/search/TermScorer.java index d14d8f76d20..8a32b4bfdd3 100644 --- a/src/java/org/apache/lucene/search/TermScorer.java +++ b/src/java/org/apache/lucene/search/TermScorer.java @@ -83,44 +83,56 @@ final class TermScorer extends Scorer { for (int i = 0; i < SCORE_CACHE_SIZE; i++) scoreCache[i] = getSimilarity().tf(i) * weightValue; - - pointerMax = termDocs.read(docs, freqs); // fill buffers - - if (pointerMax != 0) - doc = docs[0]; - else { - termDocs.close(); // close stream - doc = Integer.MAX_VALUE; // set to sentinel value - } } - public final void score(HitCollector c, final int end) throws IOException { - int d = doc; // cache doc in local - Similarity similarity = getSimilarity(); // cache sim in local - while (d < end) { // for docs in window - final int f = freqs[pointer]; - float score = // compute tf(f)*weight - f < SCORE_CACHE_SIZE // check cache - ? 
scoreCache[f] // cache hit - : similarity.tf(f)*weightValue; // cache miss + public int doc() { return doc; } - score *= Similarity.decodeNorm(norms[d]); // normalize for field + public boolean next() throws IOException { + pointer++; + if (pointer >= pointerMax) { + pointerMax = termDocs.read(docs, freqs); // refill buffer + if (pointerMax != 0) { + pointer = 0; + } else { + termDocs.close(); // close stream + doc = Integer.MAX_VALUE; // set to sentinel value + return false; + } + } + doc = docs[pointer]; + return true; + } - c.collect(d, score); // collect score + public float score() throws IOException { + int f = freqs[pointer]; + float raw = // compute tf(f)*weight + f < SCORE_CACHE_SIZE // check cache + ? scoreCache[f] // cache hit + : getSimilarity().tf(f)*weightValue; // cache miss - if (++pointer == pointerMax) { - pointerMax = termDocs.read(docs, freqs); // refill buffers - if (pointerMax != 0) { - pointer = 0; - } else { - termDocs.close(); // close stream - doc = Integer.MAX_VALUE; // set to sentinel value - return; - } - } - d = docs[pointer]; + return raw * Similarity.decodeNorm(norms[doc]); // normalize for field + } + + public boolean skipTo(int target) throws IOException { + // first scan in cache + for (pointer++; pointer < pointerMax; pointer++) { + if (!(target > docs[pointer])) { + doc = docs[pointer]; + return true; + } } - doc = d; // flush cache + + // not found in cache, seek underlying stream + boolean result = termDocs.skipTo(target); + if (result) { + pointerMax = 1; + pointer = 0; + docs[pointer] = doc = termDocs.doc(); + freqs[pointer] = termDocs.freq(); + } else { + doc = Integer.MAX_VALUE; + } + return result; } public Explanation explain(int doc) throws IOException { diff --git a/src/java/org/apache/lucene/store/RAMDirectory.java b/src/java/org/apache/lucene/store/RAMDirectory.java index 0faaff869e1..c0d7c1029de 100644 --- a/src/java/org/apache/lucene/store/RAMDirectory.java +++ b/src/java/org/apache/lucene/store/RAMDirectory.java 
@@ -226,98 +226,3 @@ public final class RAMDirectory extends Directory { public final void close() { } } - - -final class RAMInputStream extends InputStream implements Cloneable { - RAMFile file; - int pointer = 0; - - public RAMInputStream(RAMFile f) { - file = f; - length = file.length; - } - - /** InputStream methods */ - public final void readInternal(byte[] dest, int destOffset, int len) { - int remainder = len; - int start = pointer; - while (remainder != 0) { - int bufferNumber = start/InputStream.BUFFER_SIZE; - int bufferOffset = start%InputStream.BUFFER_SIZE; - int bytesInBuffer = InputStream.BUFFER_SIZE - bufferOffset; - int bytesToCopy = bytesInBuffer >= remainder ? remainder : bytesInBuffer; - byte[] buffer = (byte[])file.buffers.elementAt(bufferNumber); - System.arraycopy(buffer, bufferOffset, dest, destOffset, bytesToCopy); - destOffset += bytesToCopy; - start += bytesToCopy; - remainder -= bytesToCopy; - } - pointer += len; - } - - public final void close() { - } - - /** Random-access methods */ - public final void seekInternal(long pos) { - pointer = (int)pos; - } -} - - -final class RAMOutputStream extends OutputStream { - RAMFile file; - int pointer = 0; - - public RAMOutputStream(RAMFile f) { - file = f; - } - - /** output methods: */ - public final void flushBuffer(byte[] src, int len) { - int bufferNumber = pointer/OutputStream.BUFFER_SIZE; - int bufferOffset = pointer%OutputStream.BUFFER_SIZE; - int bytesInBuffer = OutputStream.BUFFER_SIZE - bufferOffset; - int bytesToCopy = bytesInBuffer >= len ? 
len : bytesInBuffer; - - if (bufferNumber == file.buffers.size()) - file.buffers.addElement(new byte[OutputStream.BUFFER_SIZE]); - - byte[] buffer = (byte[])file.buffers.elementAt(bufferNumber); - System.arraycopy(src, 0, buffer, bufferOffset, bytesToCopy); - - if (bytesToCopy < len) { // not all in one buffer - int srcOffset = bytesToCopy; - bytesToCopy = len - bytesToCopy; // remaining bytes - bufferNumber++; - if (bufferNumber == file.buffers.size()) - file.buffers.addElement(new byte[OutputStream.BUFFER_SIZE]); - buffer = (byte[])file.buffers.elementAt(bufferNumber); - System.arraycopy(src, srcOffset, buffer, 0, bytesToCopy); - } - pointer += len; - if (pointer > file.length) - file.length = pointer; - - file.lastModified = System.currentTimeMillis(); - } - - public final void close() throws IOException { - super.close(); - } - - /** Random-access methods */ - public final void seek(long pos) throws IOException { - super.seek(pos); - pointer = (int)pos; - } - public final long length() throws IOException { - return file.length; - } -} - -final class RAMFile { - Vector buffers = new Vector(); - long length; - long lastModified = System.currentTimeMillis(); -} diff --git a/src/java/org/apache/lucene/store/RAMFile.java b/src/java/org/apache/lucene/store/RAMFile.java new file mode 100644 index 00000000000..c151e63c349 --- /dev/null +++ b/src/java/org/apache/lucene/store/RAMFile.java @@ -0,0 +1,63 @@ +package org.apache.lucene.store; + +/* ==================================================================== + * The Apache Software License, Version 1.1 + * + * Copyright (c) 2001, 2004 The Apache Software Foundation. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. The end-user documentation included with the redistribution, + * if any, must include the following acknowledgment: + * "This product includes software developed by the + * Apache Software Foundation (http://www.apache.org/)." + * Alternately, this acknowledgment may appear in the software itself, + * if and wherever such third-party acknowledgments normally appear. + * + * 4. The names "Apache" and "Apache Software Foundation" and + * "Apache Lucene" must not be used to endorse or promote products + * derived from this software without prior written permission. For + * written permission, please contact apache@apache.org. + * + * 5. Products derived from this software may not be called "Apache", + * "Apache Lucene", nor may "Apache" appear in their name, without + * prior written permission of the Apache Software Foundation. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * . + */ + +import java.util.Vector; + +class RAMFile { + Vector buffers = new Vector(); + long length; + long lastModified = System.currentTimeMillis(); +} diff --git a/src/java/org/apache/lucene/store/RAMInputStream.java b/src/java/org/apache/lucene/store/RAMInputStream.java new file mode 100644 index 00000000000..b6038b68e46 --- /dev/null +++ b/src/java/org/apache/lucene/store/RAMInputStream.java @@ -0,0 +1,95 @@ +package org.apache.lucene.store; + +/* ==================================================================== + * The Apache Software License, Version 1.1 + * + * Copyright (c) 2001, 2004 The Apache Software Foundation. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. The end-user documentation included with the redistribution, + * if any, must include the following acknowledgment: + * "This product includes software developed by the + * Apache Software Foundation (http://www.apache.org/)." + * Alternately, this acknowledgment may appear in the software itself, + * if and wherever such third-party acknowledgments normally appear. + * + * 4. 
The names "Apache" and "Apache Software Foundation" and + * "Apache Lucene" must not be used to endorse or promote products + * derived from this software without prior written permission. For + * written permission, please contact apache@apache.org. + * + * 5. Products derived from this software may not be called "Apache", + * "Apache Lucene", nor may "Apache" appear in their name, without + * prior written permission of the Apache Software Foundation. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * . + */ + +/** + * A memory-resident {@link InputStream} implementation. 
+ * + * @version $Id$ + */ + +class RAMInputStream extends InputStream implements Cloneable { + private RAMFile file; + private int pointer = 0; + + public RAMInputStream(RAMFile f) { + file = f; + length = file.length; + } + + public void readInternal(byte[] dest, int destOffset, int len) { + int remainder = len; + int start = pointer; + while (remainder != 0) { + int bufferNumber = start/BUFFER_SIZE; + int bufferOffset = start%BUFFER_SIZE; + int bytesInBuffer = BUFFER_SIZE - bufferOffset; + int bytesToCopy = bytesInBuffer >= remainder ? remainder : bytesInBuffer; + byte[] buffer = (byte[])file.buffers.elementAt(bufferNumber); + System.arraycopy(buffer, bufferOffset, dest, destOffset, bytesToCopy); + destOffset += bytesToCopy; + start += bytesToCopy; + remainder -= bytesToCopy; + } + pointer += len; + } + + public void close() { + } + + public void seekInternal(long pos) { + pointer = (int)pos; + } +} diff --git a/src/java/org/apache/lucene/store/RAMOutputStream.java b/src/java/org/apache/lucene/store/RAMOutputStream.java new file mode 100644 index 00000000000..7d4c5d5375e --- /dev/null +++ b/src/java/org/apache/lucene/store/RAMOutputStream.java @@ -0,0 +1,145 @@ +package org.apache.lucene.store; + +/* ==================================================================== + * The Apache Software License, Version 1.1 + * + * Copyright (c) 2001, 2004 The Apache Software Foundation. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. 
The end-user documentation included with the redistribution, + * if any, must include the following acknowledgment: + * "This product includes software developed by the + * Apache Software Foundation (http://www.apache.org/)." + * Alternately, this acknowledgment may appear in the software itself, + * if and wherever such third-party acknowledgments normally appear. + * + * 4. The names "Apache" and "Apache Software Foundation" and + * "Apache Lucene" must not be used to endorse or promote products + * derived from this software without prior written permission. For + * written permission, please contact apache@apache.org. + * + * 5. Products derived from this software may not be called "Apache", + * "Apache Lucene", nor may "Apache" appear in their name, without + * prior written permission of the Apache Software Foundation. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * . 
+ */ + +import java.io.IOException; + +/** + * A memory-resident {@link OutputStream} implementation. + * + * @version $Id$ + */ + +public class RAMOutputStream extends OutputStream { + private RAMFile file; + private int pointer = 0; + + /** Construct an empty output buffer. */ + public RAMOutputStream() { + this(new RAMFile()); + } + + RAMOutputStream(RAMFile f) { + file = f; + } + + /** Copy the current contents of this buffer to the named output. */ + public void writeTo(OutputStream out) throws IOException { + flush(); + final long end = file.length; + long pos = 0; + int buffer = 0; + while (pos < end) { + int length = BUFFER_SIZE; + long nextPos = pos + length; + if (nextPos > end) { // at the last buffer + length = (int)(end - pos); + } + out.writeBytes((byte[])file.buffers.elementAt(buffer++), length); + pos = nextPos; + } + } + + /** Resets this to an empty buffer. */ + public void reset() { + try { + seek(0); + } catch (IOException e) { // should never happen + throw new RuntimeException(e.toString()); + } + + file.length = 0; + } + + public void flushBuffer(byte[] src, int len) { + int bufferNumber = pointer/BUFFER_SIZE; + int bufferOffset = pointer%BUFFER_SIZE; + int bytesInBuffer = BUFFER_SIZE - bufferOffset; + int bytesToCopy = bytesInBuffer >= len ? 
len : bytesInBuffer; + + if (bufferNumber == file.buffers.size()) + file.buffers.addElement(new byte[BUFFER_SIZE]); + + byte[] buffer = (byte[])file.buffers.elementAt(bufferNumber); + System.arraycopy(src, 0, buffer, bufferOffset, bytesToCopy); + + if (bytesToCopy < len) { // not all in one buffer + int srcOffset = bytesToCopy; + bytesToCopy = len - bytesToCopy; // remaining bytes + bufferNumber++; + if (bufferNumber == file.buffers.size()) + file.buffers.addElement(new byte[BUFFER_SIZE]); + buffer = (byte[])file.buffers.elementAt(bufferNumber); + System.arraycopy(src, srcOffset, buffer, 0, bytesToCopy); + } + pointer += len; + if (pointer > file.length) + file.length = pointer; + + file.lastModified = System.currentTimeMillis(); + } + + public void close() throws IOException { + super.close(); + } + + public void seek(long pos) throws IOException { + super.seek(pos); + pointer = (int)pos; + } + public long length() { + return file.length; + } +} diff --git a/src/test/org/apache/lucene/ThreadSafetyTest.java b/src/test/org/apache/lucene/ThreadSafetyTest.java index 017c92e72a5..d3638697495 100644 --- a/src/test/org/apache/lucene/ThreadSafetyTest.java +++ b/src/test/org/apache/lucene/ThreadSafetyTest.java @@ -54,6 +54,7 @@ package org.apache.lucene; * . 
*/ +import org.apache.lucene.util.*; import org.apache.lucene.store.*; import org.apache.lucene.document.*; import org.apache.lucene.analysis.*; @@ -93,7 +94,7 @@ class ThreadSafetyTest { Document d = new Document(); int n = RANDOM.nextInt(); d.add(Field.Keyword("id", Integer.toString(n))); - d.add(Field.UnStored("contents", intToEnglish(n))); + d.add(Field.UnStored("contents", English.intToEnglish(n))); System.out.println("Adding " + n); // Switch between single and multiple file segments @@ -151,7 +152,7 @@ class ThreadSafetyTest { throws Exception { System.out.println("Searching for " + n); Hits hits = - searcher.search(QueryParser.parse(intToEnglish(n), "contents", + searcher.search(QueryParser.parse(English.intToEnglish(n), "contents", ANALYZER)); System.out.println("Search for " + n + ": total=" + hits.length()); for (int j = 0; j < Math.min(3, hits.length()); j++) { @@ -197,76 +198,4 @@ class ThreadSafetyTest { SearcherThread searcherThread3 = new SearcherThread(true); searcherThread3.start(); } - - private static String intToEnglish(int i) { - StringBuffer result = new StringBuffer(); - intToEnglish(i, result); - return result.toString(); - } - - private static void intToEnglish(int i, StringBuffer result) { - if (i < 0) { - result.append("minus "); - i = -i; - } - if (i >= 1000000000) { // billions - intToEnglish(i/1000000000, result); - result.append("billion, "); - i = i%1000000000; - } - if (i >= 1000000) { // millions - intToEnglish(i/1000000, result); - result.append("million, "); - i = i%1000000; - } - if (i >= 1000) { // thousands - intToEnglish(i/1000, result); - result.append("thousand, "); - i = i%1000; - } - if (i >= 100) { // hundreds - intToEnglish(i/100, result); - result.append("hundred "); - i = i%100; - } - if (i >= 20) { - switch (i/10) { - case 9 : result.append("ninety"); break; - case 8 : result.append("eighty"); break; - case 7 : result.append("seventy"); break; - case 6 : result.append("sixty"); break; - case 5 : 
result.append("fifty"); break; - case 4 : result.append("forty"); break; - case 3 : result.append("thirty"); break; - case 2 : result.append("twenty"); break; - } - i = i%10; - if (i == 0) - result.append(" "); - else - result.append("-"); - } - switch (i) { - case 19 : result.append("nineteen "); break; - case 18 : result.append("eighteen "); break; - case 17 : result.append("seventeen "); break; - case 16 : result.append("sixteen "); break; - case 15 : result.append("fifteen "); break; - case 14 : result.append("fourteen "); break; - case 13 : result.append("thirteen "); break; - case 12 : result.append("twelve "); break; - case 11 : result.append("eleven "); break; - case 10 : result.append("ten "); break; - case 9 : result.append("nine "); break; - case 8 : result.append("eight "); break; - case 7 : result.append("seven "); break; - case 6 : result.append("six "); break; - case 5 : result.append("five "); break; - case 4 : result.append("four "); break; - case 3 : result.append("three "); break; - case 2 : result.append("two "); break; - case 1 : result.append("one "); break; - case 0 : result.append(""); break; - } - } } diff --git a/src/test/org/apache/lucene/search/TestBasics.java b/src/test/org/apache/lucene/search/TestBasics.java new file mode 100644 index 00000000000..21d6a5e943d --- /dev/null +++ b/src/test/org/apache/lucene/search/TestBasics.java @@ -0,0 +1,135 @@ +package org.apache.lucene.search; + +/* ==================================================================== + * The Apache Software License, Version 1.1 + * + * Copyright (c) 2001, 2004 The Apache Software Foundation. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. The end-user documentation included with the redistribution, + * if any, must include the following acknowledgment: + * "This product includes software developed by the + * Apache Software Foundation (http://www.apache.org/)." + * Alternately, this acknowledgment may appear in the software itself, + * if and wherever such third-party acknowledgments normally appear. + * + * 4. The names "Apache" and "Apache Software Foundation" and + * "Apache Lucene" must not be used to endorse or promote products + * derived from this software without prior written permission. For + * written permission, please contact apache@apache.org. + * + * 5. Products derived from this software may not be called "Apache", + * "Apache Lucene", nor may "Apache" appear in their name, without + * prior written permission of the Apache Software Foundation. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * . + */ + +import junit.framework.TestCase; +import org.apache.lucene.util.English; +import org.apache.lucene.analysis.SimpleAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.store.RAMDirectory; + +/** + * Tests basic search capabilities. + * + * @author Doug Cutting + */ +public class TestBasics extends TestCase { + private IndexSearcher searcher; + + public void setUp() throws Exception { + RAMDirectory directory = new RAMDirectory(); + IndexWriter writer + = new IndexWriter(directory, new SimpleAnalyzer(), true); + //writer.infoStream = System.out; + StringBuffer buffer = new StringBuffer(); + for (int i = 0; i < 1000; i++) { + Document doc = new Document(); + doc.add(Field.Text("field", English.intToEnglish(i))); + writer.addDocument(doc); + } + + writer.close(); + + searcher = new IndexSearcher(directory); + } + + public void testTerm() throws Exception { + Query query = new TermQuery(new Term("field", "seventy")); + Hits hits = searcher.search(query); + assertEquals(100, hits.length()); + } + + public void testTerm2() throws Exception { + Query query = new TermQuery(new Term("field", "seventish")); + Hits hits = searcher.search(query); + assertEquals(0, hits.length()); + } + + public void testPhrase() throws Exception { + PhraseQuery query = new PhraseQuery(); + query.add(new Term("field", "seventy")); + query.add(new Term("field", "seven")); + Hits hits = searcher.search(query); + assertEquals(10, hits.length()); + } + + public void testPhrase2() throws Exception { + PhraseQuery query = new PhraseQuery(); + query.add(new 
Term("field", "seventish")); + query.add(new Term("field", "sevenon")); + Hits hits = searcher.search(query); + assertEquals(0, hits.length()); + } + + public void testBoolean() throws Exception { + BooleanQuery query = new BooleanQuery(); + query.add(new TermQuery(new Term("field", "seventy")), true, false); + query.add(new TermQuery(new Term("field", "seven")), true, false); + Hits hits = searcher.search(query); + assertEquals(19, hits.length()); + } + + public void testBoolean2() throws Exception { + BooleanQuery query = new BooleanQuery(); + query.add(new TermQuery(new Term("field", "sevento")), true, false); + query.add(new TermQuery(new Term("field", "sevenly")), true, false); + Hits hits = searcher.search(query); + assertEquals(0, hits.length()); + } + +} diff --git a/src/test/org/apache/lucene/util/English.java b/src/test/org/apache/lucene/util/English.java new file mode 100644 index 00000000000..1072d54b394 --- /dev/null +++ b/src/test/org/apache/lucene/util/English.java @@ -0,0 +1,140 @@ +package org.apache.lucene.util; + +/* ==================================================================== + * The Apache Software License, Version 1.1 + * + * Copyright (c) 2001, 2004 The Apache Software Foundation. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. 
The end-user documentation included with the redistribution, + * if any, must include the following acknowledgment: + * "This product includes software developed by the + * Apache Software Foundation (http://www.apache.org/)." + * Alternately, this acknowledgment may appear in the software itself, + * if and wherever such third-party acknowledgments normally appear. + * + * 4. The names "Apache" and "Apache Software Foundation" and + * "Apache Lucene" must not be used to endorse or promote products + * derived from this software without prior written permission. For + * written permission, please contact apache@apache.org. + * + * 5. Products derived from this software may not be called "Apache", + * "Apache Lucene", nor may "Apache" appear in their name, without + * prior written permission of the Apache Software Foundation. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * <http://www.apache.org/>.
/**
 * Spells out integers as English words, for example <code>77</code> becomes
 * <code>"seventy-seven "</code>.
 *
 * <p>Output format: zero is rendered as <code>"zero"</code>; negative values
 * are prefixed with <code>"minus "</code>; every number word is followed by a
 * single space (or a hyphen between tens and units); the group names are
 * emitted as <code>"billion, "</code>, <code>"million, "</code>,
 * <code>"thousand, "</code> and <code>"hundred "</code>.  Non-zero results
 * therefore end with trailing whitespace or <code>", "</code>.
 */
public class English {

  /** Words for 1 through 19, indexed by value; slot 0 is unused. */
  private static final String[] UNDER_TWENTY = {
    "", "one", "two", "three", "four", "five", "six", "seven", "eight",
    "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
    "sixteen", "seventeen", "eighteen", "nineteen"
  };

  /** Words for 20 through 90, indexed by tens digit; slots 0 and 1 are unused. */
  private static final String[] TENS = {
    "", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy",
    "eighty", "ninety"
  };

  /**
   * Returns the English spelling of <code>i</code>.
   *
   * @param i the value to spell out; any int, including Integer.MIN_VALUE
   * @return the spelled-out value in the format described on this class
   */
  public static String intToEnglish(int i) {
    StringBuffer result = new StringBuffer();
    intToEnglish(i, result);
    return result.toString();
  }

  /**
   * Appends the English spelling of <code>i</code> to <code>result</code>.
   *
   * @param i the value to spell out; any int, including Integer.MIN_VALUE
   * @param result the buffer that receives the text
   */
  public static void intToEnglish(int i, StringBuffer result) {
    if (i == 0) {
      result.append("zero");
      return;
    }
    // Widen before negating: -Integer.MIN_VALUE overflows back to
    // Integer.MIN_VALUE in int arithmetic, which used to yield just "minus ".
    long magnitude = i;
    if (magnitude < 0) {
      result.append("minus ");
      magnitude = -magnitude;
    }
    appendPositive(magnitude, result);
  }

  /** Appends the spelling of a strictly positive value, largest group first. */
  private static void appendPositive(long n, StringBuffer result) {
    if (n >= 1000000000L) {                       // billions
      appendPositive(n / 1000000000L, result);
      result.append("billion, ");
      n %= 1000000000L;
    }
    if (n >= 1000000L) {                          // millions
      appendPositive(n / 1000000L, result);
      result.append("million, ");
      n %= 1000000L;
    }
    if (n >= 1000L) {                             // thousands
      appendPositive(n / 1000L, result);
      result.append("thousand, ");
      n %= 1000L;
    }
    if (n >= 100L) {                              // hundreds
      appendPositive(n / 100L, result);
      result.append("hundred ");
      n %= 100L;
    }
    if (n >= 20L) {                               // tens
      result.append(TENS[(int) (n / 10L)]);
      n %= 10L;
      // "twenty " when nothing follows, "twenty-" when a unit word follows.
      result.append(n == 0L ? " " : "-");
    }
    if (n > 0L) {                                 // 1..19
      result.append(UNDER_TWENTY[(int) n]).append(" ");
    }
  }

  /**
   * Prints the English spelling of the integer given as the first
   * command-line argument.
   */
  public static void main(String[] args) {
    System.out.println(intToEnglish(Integer.parseInt(args[0])));
  }

}