mirror of https://github.com/apache/lucene.git

Initial commit

Submitted by: Tim Jones
Reviewed by: Otis, Doug

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150182 13f79535-47bb-0310-9956-ffa450edef68
parent ea247bfec3
commit b918871d54
@@ -0,0 +1,206 @@
package org.apache.lucene.search;

/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;

import java.util.HashMap;
import java.io.IOException;

/**
 * Expert: collects results from a search and sorts them by terms in a
 * given field in each document.
 *
 * <p>In this version (0.1) the field to sort by must contain strictly
 * String representations of Integers.
 * See {@link IntegerSortedSearcher IntegerSortedSearcher} for more
 * information.  Each document is assumed to have a single term in the
 * given field, and the value of the term is the document's relative
 * position in the given sort order.
 *
 * <p>When one of these objects is created, a term enumerator ({@link TermEnum}) is
 * created to fetch all the terms in the index for the given field.
 * The value of each term is assumed to be an integer representing a
 * sort position.  Each document is assumed to contain one of the
 * terms, indicating where in the sort it belongs.
 *
 * <p><h3>Memory Usage</h3>
 *
 * <p>A static cache is maintained.  This cache contains an integer
 * array of length <code>IndexReader.maxDoc()</code> for each field
 * name for which a sort is performed.  In other words, the size of
 * the cache in bytes is:
 *
 * <p><code>4 * IndexReader.maxDoc() * (# of different fields actually used to sort)</code>
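 *
 * <p>(For instance: with a <code>maxDoc()</code> of 1,000,000 and two fields
 * used for sorting, that comes to <code>4 * 1,000,000 * 2 = 8,000,000</code>
 * bytes, roughly 8 MB; the figures here are only an illustration.)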
 *
 * <p>Note that the size of the cache is not affected by how many
 * fields are in the index and <i>might</i> be used to sort - only by
 * the ones actually used to sort a result set.
 *
 * <p>The cache is cleared each time a new <code>IndexReader</code> is
 * passed in, or if the value returned by <code>maxDoc()</code>
 * changes for the current IndexReader.  This class is not set up to
 * be able to efficiently sort hits from more than one index
 * simultaneously.
 *
 * <p>Created: Dec 8, 2003 12:56:03 PM
 *
 * @author  "Tim Jones" <tjluc@nacimiento.com>
 * @since   lucene 1.3
 * @version 0.1
 */
public class FieldSortedHitQueue
extends PriorityQueue {

  /**
   * Keeps track of the IndexReader which the cache
   * applies to.  If it changes, the cache is cleared.
   * We only store the hashcode so as not to mess up
   * garbage collection by having a reference to an
   * IndexReader.
   */
  protected static int lastReaderHash;

  /**
   * Contains the cache of sort information.  The
   * key is field name, the value an array of int.
   * A HashMap is used, and we are careful how we
   * handle synchronization.  This is because best
   * performance is obtained when the same IndexReader
   * is used over and over, and we therefore perform
   * many reads and few writes.
   */
  protected static HashMap fieldCache;

  /** The sort information being used by this instance */
  protected int[] fieldOrder;

  /**
   * Creates a hit queue sorted by the given field.
   * @param reader  IndexReader to use.
   * @param integer_field  Field to sort by.
   * @param size  Number of hits to return - see {@link PriorityQueue#initialize(int) initialize}
   * @throws IOException  If the internal term enumerator fails.
   */
  public FieldSortedHitQueue (IndexReader reader, String integer_field, int size)
  throws IOException {

    int hash = reader.hashCode();
    if (hash != lastReaderHash) {
      lastReaderHash = hash;
      if (fieldCache != null) {
        fieldCache.clear();
      }
      fieldCache = new HashMap();
    }

    initialize (size);
    initializeSort (reader, integer_field);
  }

  /**
   * Compares documents based on the value of the term in the field
   * being sorted by.  Documents which should appear at the top of the
   * list should have low values in the term; documents which should
   * appear at the end should have high values.
   *
   * <p>In the context of this method, "less than" means "less relevant",
   * so documents at the top of the list are "greatest" and documents at
   * the bottom are "least".
   *
   * <p>Document A is considered less than Document B
   * if A.field.term > B.field.term or A.doc > B.doc.
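   *
   * <p>For example, if the sort field holds 2 for document 5 and 10 for
   * documents 3 and 7, the results list is document 5, then 3, then 7 -
   * lowest field value first, with ties broken by document number.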
   *
   * @param a  ScoreDoc object for document a.
   * @param b  ScoreDoc object for document b.
   * @return  true if document a is less than document b.
   * @see ScoreDoc
   */
  protected final boolean lessThan (Object a, Object b) {
    ScoreDoc hitA = (ScoreDoc) a;
    ScoreDoc hitB = (ScoreDoc) b;
    int scoreA = fieldOrder[hitA.doc];
    int scoreB = fieldOrder[hitB.doc];
    if (scoreA == scoreB)
      return hitA.doc > hitB.doc;
    else
      return scoreA > scoreB;  // bigger is really less - the ones at the top should be the lowest
  }

  /**
   * Initializes the cache of sort information.  <code>fieldCache</code> is queried
   * to see if it has the term information for the given field.
   * If so, and if the reader still has the same value for maxDoc()
   * (note that we assume new IndexReaders are caught during the
   * constructor), the existing data is used.  If not, all the term values
   * for the given field are fetched.  The value of the term is assumed
   * to be the sort index for any documents containing the term.  Documents
   * should only have one term in the given field.  Multiple documents
   * can share the same term if desired (documents with the same term will
   * be sorted relative to each other by the order they were placed in
   * the index).
   * @param reader  The document index.
   * @param field  The field to sort by.
   * @throws IOException  If the term enumerator fails.
   */
  protected final void initializeSort (IndexReader reader, String field)
  throws IOException {

    fieldOrder = (int[]) fieldCache.get (field);
    if (fieldOrder == null || fieldOrder.length != reader.maxDoc()) {
      fieldOrder = new int [reader.maxDoc()];

      TermEnum enumerator = reader.terms (new Term (field, ""));
      TermDocs termDocs = reader.termDocs();
      if (enumerator.term() == null) {
        throw new RuntimeException ("no terms in field "+field);
      }

      try {
        Term term = enumerator.term();
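        // note: field names are interned (Term interns them, and IntegerSortedSearcher
        // interns the field it passes in), so the == comparison below is an identity
        // check against the interned field name rather than a textual comparison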
        while (term.field() == field) {
          termDocs.seek (term);
          if (termDocs.next()) {
            fieldOrder[termDocs.doc()] = Integer.parseInt (term.text());
          } else {
            throw new RuntimeException ("termDocs.next() failed!");
          }
          if (!enumerator.next()) {
            break;
          }
          term = enumerator.term();
        }
      } finally {
        enumerator.close();
        termDocs.close();
      }

      // be careful how the cache is updated so we
      // don't have synchronization problems.  we do
      // it this way because we assume updates will be
      // few compared to the number of reads.
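      // (i.e. build the updated map in a private copy and then swap the reference,
      // so a concurrent reader never iterates a map that is being modified in place)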
      HashMap newCache = (HashMap) fieldCache.clone();
      newCache.put (field, fieldOrder);
      fieldCache = newCache;
    }
  }
}

@@ -0,0 +1,221 @@
package org.apache.lucene.search;

/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.index.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.search.*;
import org.apache.lucene.search.TopDocs;

import java.io.IOException;
import java.util.BitSet;

/**
 * Implements search over an IndexReader using the values of terms in
 * a field as the primary sort order.  Secondary sort is by the order
 * of documents in the index.
 *
 * <p>In this version (0.1) the field to sort by must contain strictly
 * String representations of Integers (i.e. {@link Integer#toString Integer.toString()}).
 *
 * Each document is assumed to have a single term in the given field,
 * and the value of the term is the document's relative position in
 * the given sort order.  The field must be indexed, but should not be
 * stored or tokenized:
 *
 * <p><code>document.add(new Field("byAlpha", Integer.toString(x), false, true, false));</code>
 *
 * <p>In other words, the desired order of documents must be encoded
 * at the time they are entered into the index.  The first document
 * should have a low value integer, the last document a high value
 * (i.e. the documents should be numbered <code>1..n</code> where
 * <code>1</code> is the first and <code>n</code> the last).  Values
 * must be between <code>Integer.MIN_VALUE</code> and
 * <code>Integer.MAX_VALUE</code> inclusive.
 *
 * <p>Then, at search time, the field is designated to be used to sort
 * the returned hits:
 *
 * <p><code>IndexSearcher searcher = new IntegerSortedSearcher(indexReader, "byAlpha");</code>
 *
 * <p>or:
 *
 * <p><code>IntegerSortedSearcher searcher = new IntegerSortedSearcher(indexReader, "bySomething");
 * <br>Hits hits = searcher.search(query, filter);
 * <br>...
 * <br>searcher.setOrderByField("bySomethingElse");
 * <br>hits = searcher.search(query, filter);
 * <br>...
 * </code>
 *
 * <p>Note the above example shows that one of these objects can be
 * used multiple times, and the sort order changed between usages.
 *
 * <p><h3>Memory Usage</h3>
 *
 * <p>This object is almost identical to the regular IndexSearcher and
 * makes no additional memory requirements on its own.  Every time the
 * <code>search()</code> method is called, however, a new
 * {@link FieldSortedHitQueue FieldSortedHitQueue} object is created.
 * That object is responsible for putting the hits in the correct order,
 * and it maintains a cache of information based on the IndexReader
 * given to it.  See its documentation for more information on its
 * memory usage.
 *
 * <p><h3>Concurrency</h3>
 *
 * <p>This object has the same behavior during concurrent updates to
 * the index as does IndexSearcher.  Namely, in the default
 * implementation using
 * {@link org.apache.lucene.store.FSDirectory FSDirectory}, the index
 * can be updated (deletes, adds) without harm while this object
 * exists, but this object will not see the changes.  Ultimately this
 * behavior is a result of the
 * {@link org.apache.lucene.index.SegmentReader SegmentReader} class
 * internal to FSDirectory, which caches information about documents
 * in memory.
 *
 * <p>So, in order for IntegerSortedSearcher to be kept up to date with
 * changes to the index, new instances must be created instead of the
 * same one used over and over again.  This will result in lower
 * performance than if instances are reused.
 *
 * <p><h3>Updates</h3>
 *
 * <p>In order to be able to update the index without having to
 * recalculate all the sort numbers, the numbers should be stored with
 * "space" between them.  That is, sort the documents and number them
 * <code>1..n</code>.  Then, as <code>i</code> goes between
 * <code>1</code> and <code>n</code>:
 *
 * <p><code>document.add(new Field("byAlpha", Integer.toString(i*1000), false, true, false));</code>
 *
 * <p>Add a new document sorted between position 1 and 2 by:
 *
 * <p><code>document.add(new Field("byAlpha", Integer.toString(1500), false, true, false));</code>
 *
 * <p>Be careful not to overrun <code>Integer.MAX_VALUE</code>
 * (<code>2147483647</code>).  Periodically a complete reindex should
 * be run so the sort orders can be "normalized".
 *
 * <p>Created: Dec 8, 2003 12:47:26 PM
 *
 * @author  "Tim Jones" <tjluc@nacimiento.com>
 * @since   lucene 1.3
 * @version 0.1
 * @see IndexSearcher
 */
public class IntegerSortedSearcher
extends IndexSearcher {

  /** stores the field being used to sort by **/
  protected String field;

  /**
   * Searches the index in the named directory using the given
   * field as the primary sort.
   * The terms in the field must contain strictly integers in
   * the range <code>Integer.MIN_VALUE</code> and <code>Integer.MAX_VALUE</code> inclusive.
   * @see IndexSearcher(java.lang.String,java.lang.String)
   */
  public IntegerSortedSearcher(String path, String integer_field)
  throws IOException {
    this(IndexReader.open(path), integer_field);
  }

  /**
   * Searches the index in the provided directory using the
   * given field as the primary sort.
   * The terms in the field must contain strictly integers in
   * the range <code>Integer.MIN_VALUE</code> and <code>Integer.MAX_VALUE</code> inclusive.
   * @see IndexSearcher(Directory,java.lang.String)
   */
  public IntegerSortedSearcher(Directory directory, String integer_field)
  throws IOException {
    this(IndexReader.open(directory), integer_field);
  }

  /**
   * Searches the provided index using the given field as the
   * primary sort.
   * The terms in the field must contain strictly integers in
   * the range <code>Integer.MIN_VALUE</code> and <code>Integer.MAX_VALUE</code> inclusive.
   * @see IndexSearcher(IndexReader)
   */
  public IntegerSortedSearcher(IndexReader r, String integer_field) {
    super(r);
    this.field = integer_field.intern();
  }

  /**
   * Sets the field to order results by.  This can be called
   * multiple times per instance of IntegerSortedSearcher.
   * @param integer_field  The field to sort results by.
   */
  public void setOrderByField(String integer_field) {
    this.field = integer_field.intern();
  }

  /**
   * Returns the name of the field currently being used
   * to sort results by.
   * @return  Field name.
   */
  public String getOrderByField() {
    return field;
  }

  /**
   * Finds the top <code>nDocs</code>
   * hits for <code>query</code>, applying <code>filter</code> if non-null.
   *
   * Overrides IndexSearcher.search to use a FieldSortedHitQueue instead of the
   * default HitQueue.
   *
   * @see IndexSearcher#search
   */
  public TopDocs search(Query query, Filter filter, final int nDocs)
  throws IOException {

    Scorer scorer = query.weight(this).scorer(reader);
    if (scorer == null) {
      return new TopDocs(0, new ScoreDoc[0]);
    }

    final BitSet bits = filter != null ? filter.bits(reader) : null;
    final FieldSortedHitQueue hq = new FieldSortedHitQueue(reader, field, nDocs);
    final int[] totalHits = new int[1];
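    // feed every scoring document that passes the filter into the field-sorted queue;
    // the queue retains at most nDocs entries, ordered by FieldSortedHitQueue.lessThan
    // rather than by score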
    scorer.score(
      new HitCollector() {
        public final void collect(int doc, float score) {
          if (score > 0.0f &&                     // ignore zeroed buckets
              (bits == null || bits.get(doc))) {  // skip docs not in bits
            totalHits[0]++;
            hq.insert(new ScoreDoc(doc, score));
          }
        }
      });

    ScoreDoc[] scoreDocs = new ScoreDoc[hq.size()];
    for (int i = hq.size() - 1; i >= 0; i--) {    // put docs in array
      scoreDocs[i] = (ScoreDoc) hq.pop();
    }

    return new TopDocs(totalHits[0], scoreDocs);
  }
}
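
Below is a minimal end-to-end sketch of the pattern the Javadoc above describes: the sort order is encoded as string integers in an indexed, unstored, untokenized field at indexing time, and an IntegerSortedSearcher returns hits in that order at search time. It is written against the 1.3-era API used in this commit; the directory path, field names, analyzer and example documents are illustrative assumptions, not part of the original change.

import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IntegerSortedSearcher;
import org.apache.lucene.search.TermQuery;

public class IntegerSortExample {
  public static void main(String[] args) throws Exception {
    // index time: give each document a "byAlpha" term holding its rank as a string
    // (indexed, not stored, not tokenized - the form prescribed by the Javadoc above)
    IndexWriter writer = new IndexWriter("/tmp/sortdemo", new SimpleAnalyzer(), true);
    String[] titles = {"apple", "banana", "cherry"};   // already in the desired order
    for (int i = 0; i < titles.length; i++) {
      Document doc = new Document();
      doc.add(Field.Text("title", titles[i]));
      doc.add(Field.Text("contents", "fruit"));
      doc.add(new Field("byAlpha", Integer.toString(i + 1), false, true, false));
      writer.addDocument(doc);
    }
    writer.close();

    // search time: hits come back ordered by the "byAlpha" values, not by score
    IndexReader reader = IndexReader.open("/tmp/sortdemo");
    IntegerSortedSearcher searcher = new IntegerSortedSearcher(reader, "byAlpha");
    Hits hits = searcher.search(new TermQuery(new Term("contents", "fruit")));
    for (int i = 0; i < hits.length(); i++) {
      System.out.println(hits.doc(i).get("title"));   // prints apple, banana, cherry
    }
    searcher.close();
  }
}

Because FieldSortedHitQueue keys its cache by field name and ties it to one IndexReader, reusing the same searcher (or another searcher over the same reader) avoids re-reading the field's terms on later searches.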