- Initial commit

Submitted by: Tim Jones
Reviewed by: Otis, Doug


git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150182 13f79535-47bb-0310-9956-ffa450edef68
Committed by: Otis Gospodnetic, 2004-01-30 16:22:33 +00:00
parent ea247bfec3
commit b918871d54
2 changed files with 427 additions and 0 deletions

org/apache/lucene/search/FieldSortedHitQueue.java (new file)

@@ -0,0 +1,206 @@
package org.apache.lucene.search;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import java.util.HashMap;
import java.io.IOException;
/**
* Expert: collects results from a search and sorts them by terms in a
* given field in each document.
*
* <p>In this version (0.1) the field to sort by must contain only
* String representations of Integers.
* See {@link IntegerSortedSearcher IntegerSortedSearcher} for more
* information. Each document is assumed to have a single term in the
* given field, and the value of the term is the document's relative
* position in the given sort order.
*
* <p>When one of these objects is created, a TermEnum is used
* to fetch all the terms in the index for the given field.
* The value of each term is assumed to be an integer representing a
* sort position. Each document is assumed to contain one of the
* terms, indicating where in the sort it belongs.
*
* <p><h3>Memory Usage</h3>
*
* <p>A static cache is maintained. This cache contains an integer
* array of length <code>IndexReader.maxDoc()</code> for each field
* name for which a sort is performed. In other words, the size of
* the cache in bytes is:
*
* <p><code>4 * IndexReader.maxDoc() * (# of different fields actually used to sort)</code>
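*
* <p>For example, with <code>IndexReader.maxDoc()</code> of 1,000,000 and two
* different fields used to sort, the cache occupies roughly
* <code>4 * 1,000,000 * 2 = 8,000,000</code> bytes (about 8 MB).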
*
* <p>Note that the size of the cache is not affected by how many
* fields are in the index and <i>might</i> be used to sort - only by
* the ones actually used to sort a result set.
*
* <p>The cache is cleared each time a new <code>IndexReader</code> is
* passed in, or if the value returned by <code>maxDoc()</code>
* changes for the current IndexReader. This class is not set up to
* be able to efficiently sort hits from more than one index
* simultaneously.
*
* <p>Created: Dec 8, 2003 12:56:03 PM
*
* @author "Tim Jones" &lt;tjluc@nacimiento.com&gt;
* @since lucene 1.3
* @version 0.1
*/
public class FieldSortedHitQueue
extends PriorityQueue {
/**
* Keeps track of the IndexReader which the cache
* applies to. If it changes, the cache is cleared.
* We only store the hashcode so as not to mess up
* garbage collection by having a reference to an
* IndexReader.
*/
protected static int lastReaderHash;
/**
* Contains the cache of sort information. The
* key is field name, the value an array of int.
* A HashMap is used, and we are careful how we
* handle synchronization. This is because best
* performance is obtained when the same IndexReader
* is used over and over, and we therefore perform
* many reads and few writes.
*/
protected static HashMap fieldCache;
/** The sort information being used by this instance */
protected int[] fieldOrder;
/**
* Creates a hit queue sorted by the given field.
* @param reader IndexReader to use.
* @param integer_field Field to sort by.
* @param size Number of hits to return - see {@link PriorityQueue#initialize(int) initialize}
* @throws IOException If the internal term enumerator fails.
*/
public FieldSortedHitQueue (IndexReader reader, String integer_field, int size)
throws IOException {
int hash = reader.hashCode();
if (fieldCache == null || hash != lastReaderHash) {
// a different IndexReader (or the first one seen) - discard any cached sort data
lastReaderHash = hash;
fieldCache = new HashMap();
}
initialize (size);
initializeSort (reader, integer_field);
}
/**
* Compares documents based on the value of the term in the field
* being sorted by. Documents which should appear at the top of the
* list should have low values in the term; documents which should
* appear at the end should have high values.
*
* <p>In the context of this method, "less than" means "less relevant",
* so documents at the top of the list are "greatest" and documents at
* the bottom are "least".
*
* <p>Document A is considered less than Document B
* if A.field.term > B.field.term, or if the terms are equal
* and A.doc > B.doc.
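*
* <p>For example, a document whose sort term is "3" is "greater" than a
* document whose sort term is "7", and so appears ahead of it in the results.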
*
* @param a ScoreDoc object for document a.
* @param b ScoreDoc object for document b.
* @return true if document a is less than document b.
* @see ScoreDoc
*/
protected final boolean lessThan (Object a, Object b) {
ScoreDoc hitA = (ScoreDoc) a;
ScoreDoc hitB = (ScoreDoc) b;
int scoreA = fieldOrder[hitA.doc];
int scoreB = fieldOrder[hitB.doc];
if (scoreA == scoreB)
return hitA.doc > hitB.doc;
else
return scoreA > scoreB; // bigger is really less - the ones at the top should be the lowest
}
/**
* Initializes the cache of sort information. <code>fieldCache</code> is queried
* to see if it has the term information for the given field.
* If so, and if the reader still has the same value for maxDoc()
* (note that we assume new IndexReaders are caught during the
* constructor), the existing data is used. If not, all the term values
* for the given field are fetched. The value of the term is assumed
* to be the sort index for any documents containing the term. Documents
* should only have one term in the given field. Multiple documents
* can share the same term if desired (documents with the same term will
* be sorted relative to each other by the order they were placed in
* the index).
* @param reader The document index.
* @param field The field to sort by.
* @throws IOException If the term enumerator fails.
*/
protected final void initializeSort (IndexReader reader, String field)
throws IOException {
// Term.field() returns interned Strings; intern our copy so the == comparison below is valid
field = field.intern();
fieldOrder = (int[]) fieldCache.get (field);
if (fieldOrder == null || fieldOrder.length != reader.maxDoc()) {
fieldOrder = new int [reader.maxDoc()];
TermEnum enumerator = reader.terms (new Term (field, ""));
TermDocs termDocs = reader.termDocs();
if (enumerator.term() == null) {
throw new RuntimeException ("no terms in field "+field);
}
try {
Term term = enumerator.term();
while (term.field() == field) {
termDocs.seek (term);
if (!termDocs.next()) {
throw new RuntimeException ("no documents contain term " + term);
}
// several documents may share the same term; they all receive the same sort value
do {
fieldOrder[termDocs.doc()] = Integer.parseInt (term.text());
} while (termDocs.next());
if (!enumerator.next()) {
break;
}
term = enumerator.term();
}
} finally {
enumerator.close();
termDocs.close();
}
// be careful how the cache is updated so we
// don't have synchronization problems. we do
// it this way because we assume updates will be
// few compared to the number of reads.
HashMap newCache = (HashMap) fieldCache.clone();
newCache.put (field, fieldOrder);
fieldCache = newCache;
}
}
}

org/apache/lucene/search/IntegerSortedSearcher.java (new file)

@@ -0,0 +1,221 @@
package org.apache.lucene.search;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.*;
import org.apache.lucene.store.Directory;
import java.io.IOException;
import java.util.BitSet;
/**
* Implements search over an IndexReader using the values of terms in
* a field as the primary sort order. Secondary sort is by the order
* of documents in the index.
*
* <p>In this version (0.1) the field to sort by must contain only
* String representations of Integers (i.e. {@link Integer#toString Integer.toString()}).
*
* Each document is assumed to have a single term in the given field,
* and the value of the term is the document's relative position in
* the given sort order. The field must be indexed, but should not be
* stored or tokenized:
*
* <p><code>document.add(new Field("byAlpha", Integer.toString(x), false, true, false));</code>
*
* <p>In other words, the desired order of documents must be encoded
* at the time they are entered into the index. The first document
* should have a low value integer, the last document a high value
* (i.e. the documents should be numbered <code>1..n</code> where
* <code>1</code> is the first and <code>n</code> the last). Values
* must be between <code>Integer.MIN_VALUE</code> and
* <code>Integer.MAX_VALUE</code> inclusive.
*
* <p>Then, at search time, the field is designated to be used to sort
* the returned hits:
*
* <p><code>IndexSearcher searcher = new IntegerSortedSearcher(indexReader, "byAlpha");</code>
*
* <p>or:
*
* <p><code>IntegerSortedSearcher searcher = new IntegerSortedSearcher(indexReader, "bySomething");
* <br>Hits hits = searcher.search(query, filter);
* <br>...
* <br>searcher.setOrderByField("bySomethingElse");
* <br>hits = searcher.search(query, filter);
* <br>...
* </code>
*
* <p>Note the above example shows that one of these objects can be
* used multiple times, and the sort order changed between usages.
*
* <p><h3>Memory Usage</h3>
*
* <p>This object is almost identical to the regular IndexSearcher and
* imposes no additional memory requirements of its own. Every time the
* <code>search()</code> method is called, however, a new
* {@link FieldSortedHitQueue FieldSortedHitQueue} object is created.
* That object is responsible for putting the hits in the correct order,
* and it maintains a cache of information based on the IndexReader
* given to it. See its documentation for more information on its
* memory usage.
*
* <p><h3>Concurrency</h3>
*
* <p>This object has the same behavior during concurrent updates to
* the index as does IndexSearcher. Namely, in the default
* implementation using
* {@link org.apache.lucene.store.FSDirectory FSDirectory}, the index
* can be updated (deletes, adds) without harm while this object
* exists, but this object will not see the changes. Ultimately this
* behavior is a result of the
* {@link org.apache.lucene.index.SegmentReader SegmentReader} class
* used internally by IndexReader, which caches information about documents
* in memory.
*
* <p>So, in order for IntegerSortedSearcher to be kept up to date with
* changes to the index, new instances must be created instead of the
* same one used over and over again. This will result in lower
* performance than if instances are reused.
*
* <p><h3>Updates</h3>
*
* <p>In order to be able to update the index without having to
* recalculate all the sort numbers, the numbers should be stored with
* "space" between them. That is, sort the documents and number them
* <code>1..n</code>. Then, as <code>i</code> goes from
* <code>1</code> to <code>n</code>:
*
* <p><code>document.add(new Field("byAlpha", Integer.toString(i*1000), false, true, false));</code>
*
* <p>A new document can later be inserted between positions 1 and 2 by adding:
*
* <p><code>document.add(new Field("byAlpha", Integer.toString(1500), false, true, false));</code>
*
* <p>Be careful not to overrun <code>Integer.MAX_VALUE</code>
* (<code>2147483647</code>). Periodically a complete reindex should
* be run so the sort orders can be "normalized".
*
* <p>Created: Dec 8, 2003 12:47:26 PM
*
* @author "Tim Jones" &lt;tjluc@nacimiento.com&gt;
* @since lucene 1.3
* @version 0.1
* @see IndexSearcher
*/
public class IntegerSortedSearcher
extends IndexSearcher {
/** The field currently being used to sort by. */
protected String field;
/**
* Creates a searcher over the index in the named directory, using the
* given field as the primary sort.
* The terms in the field must contain only integers between
* <code>Integer.MIN_VALUE</code> and <code>Integer.MAX_VALUE</code> inclusive.
* @see IndexSearcher#IndexSearcher(java.lang.String)
*/
public IntegerSortedSearcher(String path, String integer_field)
throws IOException {
this(IndexReader.open(path), integer_field);
}
/**
* Creates a searcher over the index in the provided directory, using the
* given field as the primary sort.
* The terms in the field must contain only integers between
* <code>Integer.MIN_VALUE</code> and <code>Integer.MAX_VALUE</code> inclusive.
* @see IndexSearcher#IndexSearcher(org.apache.lucene.store.Directory)
*/
public IntegerSortedSearcher(Directory directory, String integer_field)
throws IOException {
this(IndexReader.open(directory), integer_field);
}
/**
* Creates a searcher over the provided index, using the given field
* as the primary sort.
* The terms in the field must contain only integers between
* <code>Integer.MIN_VALUE</code> and <code>Integer.MAX_VALUE</code> inclusive.
* @see IndexSearcher#IndexSearcher(org.apache.lucene.index.IndexReader)
*/
public IntegerSortedSearcher(IndexReader r, String integer_field) {
super(r);
this.field = integer_field.intern();
}
/**
* Sets the field to order results by. This can be called
* multiple times per instance of IntegerSortedSearcher.
* @param integer_field The field to sort results by.
*/
public void setOrderByField(String integer_field) {
this.field = integer_field.intern();
}
/**
* Returns the name of the field currently being used
* to sort results by.
* @return Field name.
*/
public String getOrderByField() {
return field;
}
/**
* Finds the top <code>nDocs</code>
* hits for <code>query</code>, applying <code>filter</code> if non-null.
*
* Overrides IndexSearcher.search to use a FieldSortedHitQueue instead of the
* default HitQueue.
*
* @see IndexSearcher#search
*/
public TopDocs search(Query query, Filter filter, final int nDocs)
throws IOException {
Scorer scorer = query.weight(this).scorer(reader);
if (scorer == null) {
return new TopDocs(0, new ScoreDoc[0]);
}
final BitSet bits = filter != null ? filter.bits(reader) : null;
final FieldSortedHitQueue hq = new FieldSortedHitQueue(reader, field, nDocs);
final int[] totalHits = new int[1];
scorer.score(
new HitCollector() {
public final void collect(int doc, float score) {
if (score > 0.0f && // ignore zeroed buckets
(bits == null || bits.get(doc))) { // skip docs not in bits
totalHits[0]++;
hq.insert(new ScoreDoc(doc, score));
}
}
});
ScoreDoc[] scoreDocs = new ScoreDoc[hq.size()];
for (int i = hq.size() - 1; i >= 0; i--) { // put docs in array
scoreDocs[i] = (ScoreDoc) hq.pop();
}
return new TopDocs(totalHits[0], scoreDocs);
}
}
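
For reference, here is a minimal end-to-end sketch of how the two new classes are
intended to be used, pieced together from the javadoc above. The class name
IntegerSortedSearcherExample, the "title" and "byAlpha" field names, the sample data,
and the use of RAMDirectory, StandardAnalyzer and Field.Text are illustrative
assumptions, not part of this commit.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IntegerSortedSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.RAMDirectory;

public class IntegerSortedSearcherExample {
  public static void main(String[] args) throws Exception {
    RAMDirectory directory = new RAMDirectory();
    IndexWriter writer = new IndexWriter(directory, new StandardAnalyzer(), true);

    String[] titles   = { "red panda", "red fox", "red deer" };
    int[]    sortKeys = { 3000, 1000, 2000 };   // desired order: fox, deer, panda

    for (int i = 0; i < titles.length; i++) {
      Document doc = new Document();
      doc.add(Field.Text("title", titles[i]));
      // sort key: indexed, not stored, not tokenized; values spaced apart so
      // documents can later be inserted between existing positions
      doc.add(new Field("byAlpha", Integer.toString(sortKeys[i]), false, true, false));
      writer.addDocument(doc);
    }
    writer.close();

    // search for all "red" documents; hits come back ordered by the byAlpha field
    IntegerSortedSearcher searcher = new IntegerSortedSearcher(directory, "byAlpha");
    Hits hits = searcher.search(new TermQuery(new Term("title", "red")));
    for (int i = 0; i < hits.length(); i++) {
      System.out.println(hits.doc(i).get("title"));   // red fox, red deer, red panda
    }
    searcher.close();
  }
}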