- Initial commit

Submitted by: Tim Jones Reviewed by: Otis, Doug git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150182 13f79535-47bb-0310-9956-ffa450edef68
2004-01-30 16:22:33 +00:00 · 2004-01-30 16:22:33 +00:00 · b918871d54
parent ea247bfec3
commit b918871d54
2 changed files with 427 additions and 0 deletions
--- a/src/java/org/apache/lucene/search/FieldSortedHitQueue.java
+++ b/src/java/org/apache/lucene/search/FieldSortedHitQueue.java
@ -0,0 +1,206 @@
+package org.apache.lucene.search;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.PriorityQueue;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermDocs;
+
+import java.util.HashMap;
+import java.io.IOException;
+
+/**
+ * Expert: collects results from a search and sorts them by terms in a
+ * given field in each document.
+ *
+ * <p>In this version (0.1) the field to sort by must contain strictly
+ * String representations of Integers.
+ * See {@link IntegerSortedSearcher IntegerSortedSearcher} for more
+ * information.  Each document is assumed to have a single term in the
+ * given field, and the value of the term is the document's relative
+ * position in the given sort order.
+ *
+ * <p>When one of these objects is created, a TermEnumerator is
+ * created to fetch all the terms in the index for the given field.
+ * The value of each term is assumed to be an integer representing a
+ * sort position.  Each document is assumed to contain one of the
+ * terms, indicating where in the sort it belongs.
+ *
+ * <p><h3>Memory Usage</h3>
+ *
+ * <p>A static cache is maintained.  This cache contains an integer
+ * array of length <code>IndexReader.maxDoc()</code> for each field
+ * name for which a sort is performed.  In other words, the size of
+ * the cache in bytes is:
+ *
+ * <p><code>4 * IndexReader.maxDoc() * (# of different fields actually used to sort)</code>
+ *
+ * <p>Note that the size of the cache is not affected by how many
+ * fields are in the index and <i>might</i> be used to sort - only by
+ * the ones actually used to sort a result set.
+ *
+ * <p>The cache is cleared each time a new <code>IndexReader</code> is
+ * passed in, or if the value returned by <code>maxDoc()</code>
+ * changes for the current IndexReader.  This class is not set up to
+ * be able to efficiently sort hits from more than one index
+ * simultaneously.
+ *
+ * <p>Created: Dec 8, 2003 12:56:03 PM
+ *
+ * @author  "Tim Jones" &lt;tjluc@nacimiento.com&gt;
+ * @since   lucene 1.3
+ * @version 0.1
+ */
+public class FieldSortedHitQueue
+extends PriorityQueue {
+
+    /**
+     * Hash code of the IndexReader the cache currently applies to.
+     * When a reader with a different hash code is passed to the
+     * constructor, the cache is replaced.  Only the hash code is
+     * stored so that this class does not hold a reference to an
+     * IndexReader and keep it from being garbage collected.
+     */
+    protected static int lastReaderHash;
+
+    /**
+     * Cache of sort information.  The key is the field name, the
+     * value an <code>int[]</code> indexed by document number.
+     * The map is never modified once it has been published in this
+     * field: updates build a modified copy and then swap the
+     * reference, because reads are expected to vastly outnumber
+     * writes when the same IndexReader is used over and over.
+     */
+    protected static HashMap fieldCache;
+
+    /** The sort information being used by this instance, indexed by document number. */
+    protected int[] fieldOrder;
+
+    /**
+     * Creates a hit queue sorted by the given field.
+     * @param reader  IndexReader to use.
+     * @param integer_field  Field to sort by.
+     * @param size    Number of hits to return - see {@link PriorityQueue#initialize(int) initialize}
+     * @throws IOException  If the internal term enumerator fails.
+     */
+    public FieldSortedHitQueue (IndexReader reader, String integer_field, int size)
+    throws IOException {
+
+        int hash = reader.hashCode();
+        // The null test covers the very first use when the reader's hash
+        // code happens to equal the initial value of lastReaderHash (0);
+        // without it the cache would never be created.
+        if (fieldCache == null || hash != lastReaderHash) {
+            lastReaderHash = hash;
+            // Install a fresh map instead of clearing the old one: the old
+            // map may still be in use by other queues and must stay intact.
+            fieldCache = new HashMap();
+        }
+
+        initialize (size);
+        initializeSort (reader, integer_field);
+    }
+
+    /**
+     * Compares documents based on the value of the term in the field
+     * being sorted by.  Documents which should appear at the top of the
+     * list should have low values in the term; documents which should
+     * appear at the end should have high values.
+     *
+     * <p>In the context of this method, "less than" means "less relevant",
+     * so documents at the top of the list are "greatest" and documents at
+     * the bottom are "least".
+     *
+     * <p>Document A is considered less than Document B if
+     * A.field.term > B.field.term, or if both terms are equal and
+     * A.doc > B.doc.
+     *
+     * @param a  ScoreDoc object for document a.
+     * @param b  ScoreDoc object for document b.
+     * @return true if document a is less than document b.
+     * @see ScoreDoc
+     */
+    protected final boolean lessThan (Object a, Object b) {
+        ScoreDoc hitA = (ScoreDoc) a;
+        ScoreDoc hitB = (ScoreDoc) b;
+        int scoreA = fieldOrder[hitA.doc];
+        int scoreB = fieldOrder[hitB.doc];
+        if (scoreA == scoreB)
+            return hitA.doc > hitB.doc;   // tie: the higher document number is "less"
+        else
+            return scoreA > scoreB;   // bigger is really less - the ones at the top should be the lowest
+    }
+
+    /**
+     * Initializes the sort information for this instance.
+     * <code>fieldCache</code> is queried to see if it has the term
+     * information for the given field.  If so, and if the cached array
+     * still has length <code>reader.maxDoc()</code> (new IndexReaders are
+     * expected to be caught by the constructor), the cached data is used.
+     * If not, all the terms of the given field are enumerated.  The value
+     * of each term is taken as the sort position of every document
+     * containing that term.  Documents should only have one term in the
+     * given field.  Multiple documents can share the same term if desired
+     * (documents with the same term are sorted relative to each other by
+     * document number).  Documents without a term in the field keep the
+     * default sort position <code>0</code>.
+     *
+     * @param reader  The document index.
+     * @param field   The field to sort by.
+     * @throws IOException  If the term enumerator fails.
+     * @throws RuntimeException  If the term enumerator has no term at or
+     *         after the start of the given field.
+     * @throws NumberFormatException  If a term of the field is not the
+     *         String representation of an integer.
+     */
+    protected final void initializeSort (IndexReader reader, String field)
+    throws IOException {
+
+        int[] order = (int[]) fieldCache.get (field);
+        if (order == null || order.length != reader.maxDoc()) {
+            order = new int [reader.maxDoc()];
+
+            TermEnum enumerator = reader.terms (new Term (field, ""));
+            try {
+                if (enumerator.term() == null) {
+                    throw new RuntimeException ("no terms in field "+field);
+                }
+
+                TermDocs termDocs = reader.termDocs();
+                try {
+                    Term term = enumerator.term();
+                    // compare by value so callers need not intern the field name
+                    while (field.equals (term.field())) {
+                        int position = Integer.parseInt (term.text());
+                        termDocs.seek (term);
+                        // every document containing the term shares its sort
+                        // position; a term with no documents left is skipped
+                        while (termDocs.next()) {
+                            order[termDocs.doc()] = position;
+                        }
+                        if (!enumerator.next()) {
+                            break;
+                        }
+                        term = enumerator.term();
+                    }
+                } finally {
+                    termDocs.close();
+                }
+            } finally {
+                enumerator.close();
+            }
+
+            // be careful how the cache is updated so we
+            // don't have synchronization problems.  we do
+            // it this way because we assume updates will be
+            // few compared to the number of reads.
+            HashMap newCache = (HashMap) fieldCache.clone();
+            newCache.put (field, order);
+            fieldCache = newCache;
+        }
+        fieldOrder = order;
+    }
+}
--- a/src/java/org/apache/lucene/search/IntegerSortedSearcher.java
+++ b/src/java/org/apache/lucene/search/IntegerSortedSearcher.java
@ -0,0 +1,221 @@
+package org.apache.lucene.search;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.*;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.search.*;
+import org.apache.lucene.search.TopDocs;
+
+import java.io.IOException;
+import java.util.BitSet;
+
+/**
+ * Implements search over an IndexReader using the values of terms in
+ * a field as the primary sort order.  Secondary sort is by the order
+ * of documents in the index.
+ *
+ * <p>In this version (0.1) the field to sort by must contain strictly
+ * String representations of Integers (i.e. {@link Integer#toString Integer.toString()}).
+ *
+ * Each document is assumed to have a single term in the given field,
+ * and the value of the term is the document's relative position in
+ * the given sort order.  The field must be indexed, but should not be
+ * stored or tokenized:
+ *
+ * <p><code>document.add(new Field("byAlpha", Integer.toString(x), false, true, false));</code>
+ *
+ * <p>In other words, the desired order of documents must be encoded
+ * at the time they are entered into the index.  The first document
+ * should have a low value integer, the last document a high value
+ * (i.e. the documents should be numbered <code>1..n</code> where
+ * <code>1</code> is the first and <code>n</code> the last).  Values
+ * must be between <code>Integer.MIN_VALUE</code> and
+ * <code>Integer.MAX_VALUE</code> inclusive.
+ *
+ * <p>Then, at search time, the field is designated to be used to sort
+ * the returned hits:
+ *
+ * <p><code>IndexSearcher searcher = new IntegerSortedSearcher(indexReader, "byAlpha");</code>
+ *
+ * <p>or:
+ *
+ * <p><code>IntegerSortedSearcher searcher = new IntegerSortedSearcher(indexReader, "bySomething");
+ * <br>Hits hits = searcher.search(query, filter);
+ * <br>...
+ * <br>searcher.setOrderByField("bySomethingElse");
+ * <br>hits = searcher.search(query, filter);
+ * <br>...
+ * </code>
+ *
+ * <p>Note the above example shows that one of these objects can be
+ * used multiple times, and the sort order changed between usages.
+ *
+ * <p><h3>Memory Usage</h3>
+ *
+ * <p>This object is almost identical to the regular IndexSearcher and
+ * makes no additional memory requirements on its own.  Every time the
+ * <code>search()</code> method is called, however, a new
+ * {@link FieldSortedHitQueue FieldSortedHitQueue} object is created.
+ * That object is responsible for putting the hits in the correct order,
+ * and it maintains a cache of information based on the IndexReader
+ * given to it.  See its documentation for more information on its
+ * memory usage.
+ *
+ * <p><h3>Concurrency</h3>
+ *
+ * <p>This object has the same behavior during concurrent updates to
+ * the index as does IndexSearcher.  Namely, in the default
+ * implementation using
+ * {@link org.apache.lucene.store.FSDirectory FSDirectory}, the index
+ * can be updated (deletes, adds) without harm while this object
+ * exists, but this object will not see the changes.  Ultimately this
+ * behavior is a result of the
+ * {@link org.apache.lucene.index.SegmentReader SegmentReader} class
+ * internal to FSDirectory, which caches information about documents
+ * in memory.
+ *
+ * <p>So, in order for IntegerSortedSearcher to be kept up to date with
+ * changes to the index, new instances must be created instead of the
+ * same one used over and over again.  This will result in lower
+ * performance than if instances are reused.
+ *
+ * <p><h3>Updates</h3>
+ *
+ * <p>In order to be able to update the index without having to
+ * recalculate all the sort numbers, the numbers should be stored with
+ * "space" between them.  That is, sort the documents and number them
+ * <code>1..n</code>.  Then, as <code>i</code> goes between
+ * <code>1</code> and <code>n</code>:
+ *
+ * <p><code>document.add(new Field("byAlpha", Integer.toString(i*1000), false, true, false));</code>
+ *
+ * <p>Add a new document sorted between position 1 and 2 by:
+ *
+ * <p><code>document.add(new Field("byAlpha", Integer.toString(1500), false, true, false));</code>
+ *
+ * <p>Be careful not to overrun <code>Integer.MAX_VALUE</code>
+ * (<code>2147483647</code>).  Periodically a complete reindex should
+ * be run so the sort orders can be "normalized".
+ *
+ * <p>Created: Dec 8, 2003 12:47:26 PM
+ *
+ * @author  "Tim Jones" &lt;tjluc@nacimiento.com&gt;
+ * @since   lucene 1.3
+ * @version 0.1
+ * @see IndexSearcher
+ */
+public class IntegerSortedSearcher
+extends IndexSearcher {
+
+    /** Name of the field whose integer terms determine the order of results. **/
+    protected String field;
+
+    /**
+     * Opens the index at the given path and searches it, ordering
+     * results primarily by the given field.
+     * Every term in the field must be an integer in the range
+     * <code>Integer.MIN_VALUE</code> to <code>Integer.MAX_VALUE</code> inclusive.
+     * @param path  Location of the index to open.
+     * @param integer_field  The field to sort results by.
+     * @throws IOException  If the index cannot be opened.
+     * @see IndexReader#open(String)
+     */
+    public IntegerSortedSearcher(String path, String integer_field)
+    throws IOException {
+        this(IndexReader.open(path), integer_field);
+    }
+
+    /**
+     * Opens the index in the given directory and searches it, ordering
+     * results primarily by the given field.
+     * Every term in the field must be an integer in the range
+     * <code>Integer.MIN_VALUE</code> to <code>Integer.MAX_VALUE</code> inclusive.
+     * @param directory  Directory holding the index to open.
+     * @param integer_field  The field to sort results by.
+     * @throws IOException  If the index cannot be opened.
+     * @see IndexReader#open(Directory)
+     */
+    public IntegerSortedSearcher(Directory directory, String integer_field)
+    throws IOException {
+        this(IndexReader.open(directory), integer_field);
+    }
+
+    /**
+     * Searches the index behind the given reader, ordering results
+     * primarily by the given field.
+     * Every term in the field must be an integer in the range
+     * <code>Integer.MIN_VALUE</code> to <code>Integer.MAX_VALUE</code> inclusive.
+     * @param r  Reader over the index to search.
+     * @param integer_field  The field to sort results by; stored interned.
+     * @see IndexSearcher#IndexSearcher(IndexReader)
+     */
+    public IntegerSortedSearcher(IndexReader r, String integer_field) {
+        super(r);
+        field = integer_field.intern();
+    }
+
+    /**
+     * Changes the field used to order results.  May be called any
+     * number of times on the same IntegerSortedSearcher; the new
+     * field is used by searches started afterwards.
+     * @param integer_field  The field to sort results by; stored interned.
+     */
+    public void setOrderByField(String integer_field) {
+        field = integer_field.intern();
+    }
+
+    /**
+     * Reports which field is currently used to order results.
+     * @return  Field name.
+     */
+    public String getOrderByField() {
+        return this.field;
+    }
+
+
+    /**
+     * Finds the top <code>nDocs</code> hits for <code>query</code>,
+     * applying <code>filter</code> if non-null.
+     *
+     * Collects hits into a {@link FieldSortedHitQueue FieldSortedHitQueue}
+     * built for the current sort field, so the result is ordered by that
+     * field.
+     *
+     * @param query   The query to run.
+     * @param filter  Restricts the documents that may match; may be null.
+     * @param nDocs   Size of the hit queue.
+     * @return  The total number of accepted hits and the hits retained
+     *          by the queue; an empty result if the query yields no scorer.
+     * @throws IOException  If reading the index fails.
+     * @see IndexSearcher#search
+     */
+    public TopDocs search(Query query, Filter filter, final int nDocs)
+    throws IOException {
+
+        Scorer scorer = query.weight(this).scorer(reader);
+        if (scorer == null) {
+            return new TopDocs(0, new ScoreDoc[0]);
+        }
+
+        // documents the filter lets through; null means "no filtering"
+        final BitSet allowed;
+        if (filter == null) {
+            allowed = null;
+        } else {
+            allowed = filter.bits(reader);
+        }
+
+        final FieldSortedHitQueue queue = new FieldSortedHitQueue(reader, field, nDocs);
+        final int[] hitCount = new int[1];      // one-element array so the collector can update it
+
+        HitCollector collector = new HitCollector() {
+            public final void collect(int doc, float score) {
+                if (!(score > 0.0f)) {
+                    return;                     // ignore zeroed buckets
+                }
+                if (allowed != null && !allowed.get(doc)) {
+                    return;                     // skip docs not in the filter
+                }
+                hitCount[0]++;
+                queue.insert(new ScoreDoc(doc, score));
+            }
+        };
+        scorer.score(collector);
+
+        // drain the queue, filling the array from the last slot to the first
+        ScoreDoc[] ordered = new ScoreDoc[queue.size()];
+        int slot = ordered.length;
+        while (slot > 0) {
+            ordered[--slot] = (ScoreDoc) queue.pop();
+        }
+
+        return new TopDocs(hitCount[0], ordered);
+    }
+}