mirror of https://github.com/apache/lucene.git
- Initial commit
Submitted by: Tim Jones Reviewed by: Otis, Doug git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150182 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
ea247bfec3
commit
b918871d54
|
@ -0,0 +1,206 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.TermEnum;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermDocs;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Expert: collects results from a search and sorts them by terms in a
|
||||
* given field in each document.
|
||||
*
|
||||
* <p>In this version (0.1) the field to sort by must contain strictly
|
||||
* String representations of Integers.
|
||||
* See {@link SortedIndexSearcher SortedIndexSearcher} for more
|
||||
* information. Each document is assumed to have a single term in the
|
||||
* given field, and the value of the term is the document's relative
|
||||
* position in the given sort order.
|
||||
*
|
||||
* <p>When one of these objects is created, a TermEnumerator is
|
||||
* created to fetch all the terms in the index for the given field.
|
||||
* The value of each term is assumed to be an integer representing a
|
||||
* sort position. Each document is assumed to contain one of the
|
||||
* terms, indicating where in the sort it belongs.
|
||||
*
|
||||
* <p><h3>Memory Usage</h3>
|
||||
*
|
||||
* <p>A static cache is maintained. This cache contains an integer
|
||||
* array of length <code>IndexReader.maxDoc()</code> for each field
|
||||
* name for which a sort is performed. In other words, the size of
|
||||
* the cache in bytes is:
|
||||
*
|
||||
* <p><code>4 * IndexReader.maxDoc() * (# of different fields actually used to sort)</code>
|
||||
*
|
||||
* <p>Note that the size of the cache is not affected by how many
|
||||
* fields are in the index and <i>might</i> be used to sort - only by
|
||||
* the ones actually used to sort a result set.
|
||||
*
|
||||
* <p>The cache is cleared each time a new <code>IndexReader</code> is
|
||||
* passed in, or if the value returned by <code>maxDoc()</code>
|
||||
* changes for the current IndexReader. This class is not set up to
|
||||
* be able to efficiently sort hits from more than one index
|
||||
* simultaneously.
|
||||
*
|
||||
* <p>Created: Dec 8, 2003 12:56:03 PM
|
||||
*
|
||||
* @author "Tim Jones" <tjluc@nacimiento.com>
|
||||
* @since lucene 1.3
|
||||
* @version 0.1
|
||||
*/
|
||||
public class FieldSortedHitQueue
|
||||
extends PriorityQueue {
|
||||
|
||||
/**
|
||||
* Keeps track of the IndexReader which the cache
|
||||
* applies to. If it changes, the cache is cleared.
|
||||
* We only store the hashcode so as not to mess up
|
||||
* garbage collection by having a reference to an
|
||||
* IndexReader.
|
||||
*/
|
||||
protected static int lastReaderHash;
|
||||
|
||||
/**
|
||||
* Contains the cache of sort information. The
|
||||
* key is field name, the value an array of int.
|
||||
* A HashMap is used, and we are careful how we
|
||||
* handle synchronization. This is because best
|
||||
* performance is obtained when the same IndexReader
|
||||
* is used over and over, and we therefore perform
|
||||
* many reads and few writes.
|
||||
*/
|
||||
protected static HashMap fieldCache;
|
||||
|
||||
/** The sort information being used by this instance */
|
||||
protected int[] fieldOrder;
|
||||
|
||||
/**
|
||||
* Creates a hit queue sorted by the given field.
|
||||
* @param reader IndexReader to use.
|
||||
* @param integer_field Field to sort by.
|
||||
* @param size Number of hits to return - see {@link PriorityQueue#initialize(int) initialize}
|
||||
* @throws IOException If the internal term enumerator fails.
|
||||
*/
|
||||
public FieldSortedHitQueue (IndexReader reader, String integer_field, int size)
|
||||
throws IOException {
|
||||
|
||||
int hash = reader.hashCode();
|
||||
if (hash != lastReaderHash) {
|
||||
lastReaderHash = hash;
|
||||
if (fieldCache != null) {
|
||||
fieldCache.clear();
|
||||
}
|
||||
fieldCache = new HashMap();
|
||||
}
|
||||
|
||||
initialize (size);
|
||||
initializeSort (reader, integer_field);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compares documents based on the value of the term in the field
|
||||
* being sorted by. Documents which should appear at the top of the
|
||||
* list should have low values in the term; documents which should
|
||||
* appear at the end should have high values.
|
||||
*
|
||||
* <p>In the context of this method, "less than" means "less relevant",
|
||||
* so documents at the top of the list are "greatest" and documents at
|
||||
* the bottom are "least".
|
||||
*
|
||||
* <p>Document A is considered less than Document B
|
||||
* if A.field.term > B.field.term or A.doc > B.doc.
|
||||
*
|
||||
* @param a ScoreDoc object for document a.
|
||||
* @param b ScoreDoc object for document b.
|
||||
* @return true if document a is less than document b.
|
||||
* @see ScoreDoc
|
||||
*/
|
||||
protected final boolean lessThan (Object a, Object b) {
|
||||
ScoreDoc hitA = (ScoreDoc) a;
|
||||
ScoreDoc hitB = (ScoreDoc) b;
|
||||
int scoreA = fieldOrder[hitA.doc];
|
||||
int scoreB = fieldOrder[hitB.doc];
|
||||
if (scoreA == scoreB)
|
||||
return hitA.doc > hitB.doc;
|
||||
else
|
||||
return scoreA > scoreB; // bigger is really less - the ones at the top should be the lowest
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes the cache of sort information. <code>fieldCache</code> is queried
|
||||
* to see if it has the term information for the given field.
|
||||
* If so, and if the reader still has the same value for maxDoc()
|
||||
* (note that we assume new IndexReaders are caught during the
|
||||
* constructor), the existing data is used. If not, all the term values
|
||||
* for the given field are fetched. The value of the term is assumed
|
||||
* to be the sort index for any documents containing the term. Documents
|
||||
* should only have one term in the given field. Multiple documents
|
||||
* can share the same term if desired (documents with the same term will
|
||||
* be sorted relative to each other by the order they were placed in
|
||||
* the index).
|
||||
* @param reader The document index.
|
||||
* @param field The field to sort by.
|
||||
* @throws IOException If the term enumerator fails.
|
||||
*/
|
||||
protected final void initializeSort (IndexReader reader, String field)
|
||||
throws IOException {
|
||||
|
||||
fieldOrder = (int[]) fieldCache.get (field);
|
||||
if (fieldOrder == null || fieldOrder.length != reader.maxDoc()) {
|
||||
fieldOrder = new int [reader.maxDoc()];
|
||||
|
||||
TermEnum enumerator = reader.terms (new Term (field, ""));
|
||||
TermDocs termDocs = reader.termDocs();
|
||||
if (enumerator.term() == null) {
|
||||
throw new RuntimeException ("no terms in field "+field);
|
||||
}
|
||||
|
||||
try {
|
||||
Term term = enumerator.term();
|
||||
while (term.field() == field) {
|
||||
termDocs.seek (term);
|
||||
if (termDocs.next()) {
|
||||
fieldOrder[termDocs.doc()] = Integer.parseInt (term.text());
|
||||
} else {
|
||||
throw new RuntimeException ("termDocs.next() failed!");
|
||||
}
|
||||
if (!enumerator.next()) {
|
||||
break;
|
||||
}
|
||||
term = enumerator.term();
|
||||
}
|
||||
} finally {
|
||||
enumerator.close();
|
||||
termDocs.close();
|
||||
}
|
||||
|
||||
// be careful how the cache is updated so we
|
||||
// don't have synchronization problems. we do
|
||||
// it this way because we assume updates will be
|
||||
// few compared to the number of reads.
|
||||
HashMap newCache = (HashMap) fieldCache.clone();
|
||||
newCache.put (field, fieldOrder);
|
||||
fieldCache = newCache;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,221 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.index.*;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.search.*;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.BitSet;
|
||||
|
||||
/**
|
||||
* Implements search over an IndexReader using the values of terms in
|
||||
* a field as the primary sort order. Secondary sort is by the order
|
||||
* of documents in the index.
|
||||
*
|
||||
* <p>In this version (0.1) the field to sort by must contain strictly
|
||||
* String representations of Integers (i.e. {@link Integer#toString Integer.toString()}).
|
||||
*
|
||||
* Each document is assumed to have a single term in the given field,
|
||||
* and the value of the term is the document's relative position in
|
||||
* the given sort order. The field must be indexed, but should not be
|
||||
* stored or tokenized:
|
||||
*
|
||||
* <p><code>document.add(new Field("byAlpha", Integer.toString(x), false, true, false));</code>
|
||||
*
|
||||
* <p>In other words, the desired order of documents must be encoded
|
||||
* at the time they are entered into the index. The first document
|
||||
* should have a low value integer, the last document a high value
|
||||
* (i.e. the documents should be numbered <code>1..n</code> where
|
||||
* <code>1</code> is the first and <code>n</code> the last). Values
|
||||
* must be between <code>Integer.MIN_VALUE</code> and
|
||||
* <code>Integer.MAX_VALUE</code> inclusive.
|
||||
*
|
||||
* <p>Then, at search time, the field is designated to be used to sort
|
||||
* the returned hits:
|
||||
*
|
||||
* <p><code>IndexSearcher searcher = new IntegerSortedSearcher(indexReader, "byAlpha");</code>
|
||||
*
|
||||
* <p>or:
|
||||
*
|
||||
* <p><code>IntegerSortedSearcher searcher = new IntegerSortedSearcher(indexReader, "bySomething");
|
||||
* <br>Hits hits = searcher.search(query, filter);
|
||||
* <br>...
|
||||
* <br>searcher.setOrderByField("bySomethingElse");
|
||||
* <br>hits = searcher.search(query, filter);
|
||||
* <br>...
|
||||
* </code>
|
||||
*
|
||||
* <p>Note the above example shows that one of these objects can be
|
||||
* used multiple times, and the sort order changed between usages.
|
||||
*
|
||||
* <p><h3>Memory Usage</h3>
|
||||
*
|
||||
* <p>This object is almost identical to the regular IndexSearcher and
|
||||
* makes no additional memory requirements on its own. Every time the
|
||||
* <code>search()</code> method is called, however, a new
|
||||
* {@link FieldSortedHitQueue FieldSortedHitQueue} object is created.
|
||||
* That object is responsible for putting the hits in the correct order,
|
||||
* and it maintains a cache of information based on the IndexReader
|
||||
* given to it. See its documentation for more information on its
|
||||
* memory usage.
|
||||
*
|
||||
* <p><h3>Concurrency</h3>
|
||||
*
|
||||
* <p>This object has the same behavior during concurrent updates to
|
||||
* the index as does IndexSearcher. Namely, in the default
|
||||
* implementation using
|
||||
* {@link org.apache.lucene.store.FSDirectory FSDirectory}, the index
|
||||
* can be updated (deletes, adds) without harm while this object
|
||||
* exists, but this object will not see the changes. Ultimately this
|
||||
* behavior is a result of the
|
||||
* {@link org.apache.lucene.index.SegmentReader SegmentReader} class
|
||||
* internal to FSDirectory, which caches information about documents
|
||||
* in memory.
|
||||
*
|
||||
* <p>So, in order for IntegerSortedSearcher to be kept up to date with
|
||||
* changes to the index, new instances must be created instead of the
|
||||
* same one used over and over again. This will result in lower
|
||||
* performance than if instances are reused.
|
||||
*
|
||||
* <p><h3>Updates</h3>
|
||||
*
|
||||
* <p>In order to be able to update the index without having to
|
||||
* recalculate all the sort numbers, the numbers should be stored with
|
||||
* "space" between them. That is, sort the documents and number them
|
||||
* <code>1..n</code>. Then, as <code>i</code> goes between
|
||||
* <code>1</code> and <code>n</code>:
|
||||
*
|
||||
* <p><code>document.add(new Field("byAlpha", Integer.toString(i*1000), false, true, false));</code>
|
||||
*
|
||||
* <p>Add a new document sorted between position 1 and 2 by:
|
||||
*
|
||||
* <p><code>document.add(new Field("byAlpha", Integer.toString(1500), false, true, false));</code>
|
||||
*
|
||||
* <p>Be careful not to overun <code>Integer.MAX_VALUE</code>
|
||||
* (<code>2147483647</code>). Periodically a complete reindex should
|
||||
* be run so the sort orders can be "normalized".
|
||||
*
|
||||
* <p>Created: Dec 8, 2003 12:47:26 PM
|
||||
*
|
||||
* @author "Tim Jones" <tjluc@nacimiento.com>
|
||||
* @since lucene 1.3
|
||||
* @version 0.1
|
||||
* @see IndexSearcher
|
||||
*/
|
||||
public class IntegerSortedSearcher
|
||||
extends IndexSearcher {
|
||||
|
||||
/** stores the field being used to sort by **/
|
||||
protected String field;
|
||||
|
||||
/**
|
||||
* Searches the index in the named directory using the given
|
||||
* field as the primary sort.
|
||||
* The terms in the field must contain strictly integers in
|
||||
* the range <code>Integer.MIN_VALUE</code> and <code>Integer.MAX_VALUE</code> inclusive.
|
||||
* @see IndexSearcher(java.lang.String,java.lang.String)
|
||||
*/
|
||||
public IntegerSortedSearcher(String path, String integer_field)
|
||||
throws IOException {
|
||||
this(IndexReader.open(path), integer_field);
|
||||
}
|
||||
|
||||
/**
|
||||
* Searches the index in the provided directory using the
|
||||
* given field as the primary sort.
|
||||
* The terms in the field must contain strictly integers in
|
||||
* the range <code>Integer.MIN_VALUE</code> and <code>Integer.MAX_VALUE</code> inclusive.
|
||||
* @see IndexSearcher(Directory,java.lang.String)
|
||||
*/
|
||||
public IntegerSortedSearcher(Directory directory, String integer_field)
|
||||
throws IOException {
|
||||
this(IndexReader.open(directory), integer_field);
|
||||
}
|
||||
|
||||
/**
|
||||
* Searches the provided index using the given field as the
|
||||
* primary sort.
|
||||
* The terms in the field must contain strictly integers in
|
||||
* the range <code>Integer.MIN_VALUE</code> and <code>Integer.MAX_VALUE</code> inclusive.
|
||||
* @see IndexSearcher(IndexReader)
|
||||
*/
|
||||
public IntegerSortedSearcher(IndexReader r, String integer_field) {
|
||||
super(r);
|
||||
this.field = integer_field.intern();
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the field to order results by. This can be called
|
||||
* multiple times per instance of IntegerSortedSearcher.
|
||||
* @param integer_field The field to sort results by.
|
||||
*/
|
||||
public void setOrderByField(String integer_field) {
|
||||
this.field = integer_field.intern();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the name of the field currently being used
|
||||
* to sort results by.
|
||||
* @return Field name.
|
||||
*/
|
||||
public String getOrderByField() {
|
||||
return field;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Finds the top <code>nDocs</code>
|
||||
* hits for <code>query</code>, applying <code>filter</code> if non-null.
|
||||
*
|
||||
* Overrides IndexSearcher.search to use a FieldSortedHitQueue instead of the
|
||||
* default HitQueue.
|
||||
*
|
||||
* @see IndexSearcher#search
|
||||
*/
|
||||
public TopDocs search(Query query, Filter filter, final int nDocs)
|
||||
throws IOException {
|
||||
|
||||
Scorer scorer = query.weight(this).scorer(reader);
|
||||
if (scorer == null) {
|
||||
return new TopDocs(0, new ScoreDoc[0]);
|
||||
}
|
||||
|
||||
final BitSet bits = filter != null ? filter.bits(reader) : null;
|
||||
final FieldSortedHitQueue hq = new FieldSortedHitQueue(reader, field, nDocs);
|
||||
final int[] totalHits = new int[1];
|
||||
scorer.score(
|
||||
new HitCollector() {
|
||||
public final void collect(int doc, float score) {
|
||||
if (score > 0.0f && // ignore zeroed buckets
|
||||
(bits == null || bits.get(doc))) { // skip docs not in bits
|
||||
totalHits[0]++;
|
||||
hq.insert(new ScoreDoc(doc, score));
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
ScoreDoc[] scoreDocs = new ScoreDoc[hq.size()];
|
||||
for (int i = hq.size() - 1; i >= 0; i--) { // put docs in array
|
||||
scoreDocs[i] = (ScoreDoc) hq.pop();
|
||||
}
|
||||
|
||||
return new TopDocs(totalHits[0], scoreDocs);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue