mirror of https://github.com/apache/lucene.git

Initial commit

Submitted by: Tim Jones
Reviewed by: Otis, Doug

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150182 13f79535-47bb-0310-9956-ffa450edef68
parent ea247bfec3
commit b918871d54
@@ -0,0 +1,206 @@
package org.apache.lucene.search;

/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;

import java.util.HashMap;
import java.io.IOException;

/**
 * Expert: collects results from a search and sorts them by terms in a
 * given field in each document.
 *
 * <p>In this version (0.1) the field to sort by must contain strictly
 * String representations of Integers.
 * See {@link IntegerSortedSearcher IntegerSortedSearcher} for more
 * information.  Each document is assumed to have a single term in the
 * given field, and the value of the term is the document's relative
 * position in the given sort order.
 *
 * <p>When one of these objects is created, a term enumerator ({@link TermEnum}) is
 * created to fetch all the terms in the index for the given field.
 * The value of each term is assumed to be an integer representing a
 * sort position.  Each document is assumed to contain one of the
 * terms, indicating where in the sort it belongs.
 *
 * <p><h3>Memory Usage</h3>
 *
 * <p>A static cache is maintained.  This cache contains an integer
 * array of length <code>IndexReader.maxDoc()</code> for each field
 * name for which a sort is performed.  In other words, the size of
 * the cache in bytes is:
 *
 * <p><code>4 * IndexReader.maxDoc() * (# of different fields actually used to sort)</code>
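 *
 * <p>(For instance: with a <code>maxDoc()</code> of 1,000,000 and two fields
 * used for sorting, that comes to <code>4 * 1,000,000 * 2 = 8,000,000</code>
 * bytes, roughly 8 MB; the figures here are only an illustration.)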
 *
 * <p>Note that the size of the cache is not affected by how many
 * fields are in the index and <i>might</i> be used to sort - only by
 * the ones actually used to sort a result set.
 *
 * <p>The cache is cleared each time a new <code>IndexReader</code> is
 * passed in, or if the value returned by <code>maxDoc()</code>
 * changes for the current IndexReader.  This class is not set up to
 * be able to efficiently sort hits from more than one index
 * simultaneously.
 *
 * <p>Created: Dec 8, 2003 12:56:03 PM
 *
 * @author  "Tim Jones" <tjluc@nacimiento.com>
 * @since   lucene 1.3
 * @version 0.1
 */
public class FieldSortedHitQueue
extends PriorityQueue {

  /**
   * Keeps track of the IndexReader which the cache
   * applies to.  If it changes, the cache is cleared.
   * We only store the hashcode so as not to mess up
   * garbage collection by having a reference to an
   * IndexReader.
   */
  protected static int lastReaderHash;

  /**
   * Contains the cache of sort information.  The
   * key is field name, the value an array of int.
   * A HashMap is used, and we are careful how we
   * handle synchronization.  This is because best
   * performance is obtained when the same IndexReader
   * is used over and over, and we therefore perform
   * many reads and few writes.
   */
  protected static HashMap fieldCache;

  /** The sort information being used by this instance */
  protected int[] fieldOrder;

  /**
   * Creates a hit queue sorted by the given field.
   * @param reader  IndexReader to use.
   * @param integer_field  Field to sort by.
   * @param size  Number of hits to return - see {@link PriorityQueue#initialize(int) initialize}
   * @throws IOException  If the internal term enumerator fails.
   */
  public FieldSortedHitQueue (IndexReader reader, String integer_field, int size)
  throws IOException {

    int hash = reader.hashCode();
    if (hash != lastReaderHash) {
      lastReaderHash = hash;
      if (fieldCache != null) {
        fieldCache.clear();
      }
      fieldCache = new HashMap();
    }

    initialize (size);
    initializeSort (reader, integer_field);
  }

  /**
   * Compares documents based on the value of the term in the field
   * being sorted by.  Documents which should appear at the top of the
   * list should have low values in the term; documents which should
   * appear at the end should have high values.
   *
   * <p>In the context of this method, "less than" means "less relevant",
   * so documents at the top of the list are "greatest" and documents at
   * the bottom are "least".
   *
   * <p>Document A is considered less than Document B
   * if A.field.term > B.field.term or A.doc > B.doc.
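   *
   * <p>For example, if the sort field holds 2 for document 5 and 10 for
   * documents 3 and 7, the results list is document 5, then 3, then 7 -
   * lowest field value first, with ties broken by document number.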
   *
   * @param a  ScoreDoc object for document a.
   * @param b  ScoreDoc object for document b.
   * @return  true if document a is less than document b.
   * @see ScoreDoc
   */
  protected final boolean lessThan (Object a, Object b) {
    ScoreDoc hitA = (ScoreDoc) a;
    ScoreDoc hitB = (ScoreDoc) b;
    int scoreA = fieldOrder[hitA.doc];
    int scoreB = fieldOrder[hitB.doc];
    if (scoreA == scoreB)
      return hitA.doc > hitB.doc;
    else
      return scoreA > scoreB;  // bigger is really less - the ones at the top should be the lowest
  }

  /**
   * Initializes the cache of sort information.  <code>fieldCache</code> is queried
   * to see if it has the term information for the given field.
   * If so, and if the reader still has the same value for maxDoc()
   * (note that we assume new IndexReaders are caught during the
   * constructor), the existing data is used.  If not, all the term values
   * for the given field are fetched.  The value of the term is assumed
   * to be the sort index for any documents containing the term.  Documents
   * should only have one term in the given field.  Multiple documents
   * can share the same term if desired (documents with the same term will
   * be sorted relative to each other by the order they were placed in
   * the index).
   * @param reader  The document index.
   * @param field  The field to sort by.
   * @throws IOException  If the term enumerator fails.
   */
  protected final void initializeSort (IndexReader reader, String field)
  throws IOException {

    fieldOrder = (int[]) fieldCache.get (field);
    if (fieldOrder == null || fieldOrder.length != reader.maxDoc()) {
      fieldOrder = new int [reader.maxDoc()];

      TermEnum enumerator = reader.terms (new Term (field, ""));
      TermDocs termDocs = reader.termDocs();
      if (enumerator.term() == null) {
        throw new RuntimeException ("no terms in field "+field);
      }

      try {
        Term term = enumerator.term();
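        // note: field names are interned (Term interns them, and IntegerSortedSearcher
        // interns the field it passes in), so the == comparison below is an identity
        // check against the interned field name rather than a textual comparison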
        while (term.field() == field) {
          termDocs.seek (term);
          if (termDocs.next()) {
            fieldOrder[termDocs.doc()] = Integer.parseInt (term.text());
          } else {
            throw new RuntimeException ("termDocs.next() failed!");
          }
          if (!enumerator.next()) {
            break;
          }
          term = enumerator.term();
        }
      } finally {
        enumerator.close();
        termDocs.close();
      }

      // be careful how the cache is updated so we
      // don't have synchronization problems.  we do
      // it this way because we assume updates will be
      // few compared to the number of reads.
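      // (i.e. build the updated map in a private copy and then swap the reference,
      // so a concurrent reader never iterates a map that is being modified in place)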
      HashMap newCache = (HashMap) fieldCache.clone();
      newCache.put (field, fieldOrder);
      fieldCache = newCache;
    }
  }
}

@@ -0,0 +1,221 @@
package org.apache.lucene.search;

/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.index.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.search.*;
import org.apache.lucene.search.TopDocs;

import java.io.IOException;
import java.util.BitSet;

/**
 * Implements search over an IndexReader using the values of terms in
 * a field as the primary sort order.  Secondary sort is by the order
 * of documents in the index.
 *
 * <p>In this version (0.1) the field to sort by must contain strictly
 * String representations of Integers (i.e. {@link Integer#toString Integer.toString()}).
 *
 * Each document is assumed to have a single term in the given field,
 * and the value of the term is the document's relative position in
 * the given sort order.  The field must be indexed, but should not be
 * stored or tokenized:
 *
 * <p><code>document.add(new Field("byAlpha", Integer.toString(x), false, true, false));</code>
 *
 * <p>In other words, the desired order of documents must be encoded
 * at the time they are entered into the index.  The first document
 * should have a low value integer, the last document a high value
 * (i.e. the documents should be numbered <code>1..n</code> where
 * <code>1</code> is the first and <code>n</code> the last).  Values
 * must be between <code>Integer.MIN_VALUE</code> and
 * <code>Integer.MAX_VALUE</code> inclusive.
 *
 * <p>Then, at search time, the field is designated to be used to sort
 * the returned hits:
 *
 * <p><code>IndexSearcher searcher = new IntegerSortedSearcher(indexReader, "byAlpha");</code>
 *
 * <p>or:
 *
 * <p><code>IntegerSortedSearcher searcher = new IntegerSortedSearcher(indexReader, "bySomething");
 * <br>Hits hits = searcher.search(query, filter);
 * <br>...
 * <br>searcher.setOrderByField("bySomethingElse");
 * <br>hits = searcher.search(query, filter);
 * <br>...
 * </code>
 *
 * <p>Note the above example shows that one of these objects can be
 * used multiple times, and the sort order changed between usages.
 *
 * <p><h3>Memory Usage</h3>
 *
 * <p>This object is almost identical to the regular IndexSearcher and
 * makes no additional memory requirements on its own.  Every time the
 * <code>search()</code> method is called, however, a new
 * {@link FieldSortedHitQueue FieldSortedHitQueue} object is created.
 * That object is responsible for putting the hits in the correct order,
 * and it maintains a cache of information based on the IndexReader
 * given to it.  See its documentation for more information on its
 * memory usage.
 *
 * <p><h3>Concurrency</h3>
 *
 * <p>This object has the same behavior during concurrent updates to
 * the index as does IndexSearcher.  Namely, in the default
 * implementation using
 * {@link org.apache.lucene.store.FSDirectory FSDirectory}, the index
 * can be updated (deletes, adds) without harm while this object
 * exists, but this object will not see the changes.  Ultimately this
 * behavior is a result of the
 * {@link org.apache.lucene.index.SegmentReader SegmentReader} class
 * internal to FSDirectory, which caches information about documents
 * in memory.
 *
 * <p>So, in order for IntegerSortedSearcher to be kept up to date with
 * changes to the index, new instances must be created instead of the
 * same one used over and over again.  This will result in lower
 * performance than if instances are reused.
 *
 * <p><h3>Updates</h3>
 *
 * <p>In order to be able to update the index without having to
 * recalculate all the sort numbers, the numbers should be stored with
 * "space" between them.  That is, sort the documents and number them
 * <code>1..n</code>.  Then, as <code>i</code> goes between
 * <code>1</code> and <code>n</code>:
 *
 * <p><code>document.add(new Field("byAlpha", Integer.toString(i*1000), false, true, false));</code>
 *
 * <p>Add a new document sorted between position 1 and 2 by:
 *
 * <p><code>document.add(new Field("byAlpha", Integer.toString(1500), false, true, false));</code>
 *
 * <p>Be careful not to overrun <code>Integer.MAX_VALUE</code>
 * (<code>2147483647</code>).  Periodically a complete reindex should
 * be run so the sort orders can be "normalized".
 *
 * <p>Created: Dec 8, 2003 12:47:26 PM
 *
 * @author  "Tim Jones" <tjluc@nacimiento.com>
 * @since   lucene 1.3
 * @version 0.1
 * @see IndexSearcher
 */
public class IntegerSortedSearcher
extends IndexSearcher {

  /** stores the field being used to sort by **/
  protected String field;

  /**
   * Searches the index in the named directory using the given
   * field as the primary sort.
   * The terms in the field must contain strictly integers in
   * the range <code>Integer.MIN_VALUE</code> and <code>Integer.MAX_VALUE</code> inclusive.
   * @see IndexSearcher(java.lang.String,java.lang.String)
   */
  public IntegerSortedSearcher(String path, String integer_field)
  throws IOException {
    this(IndexReader.open(path), integer_field);
  }

  /**
   * Searches the index in the provided directory using the
   * given field as the primary sort.
   * The terms in the field must contain strictly integers in
   * the range <code>Integer.MIN_VALUE</code> and <code>Integer.MAX_VALUE</code> inclusive.
   * @see IndexSearcher(Directory,java.lang.String)
   */
  public IntegerSortedSearcher(Directory directory, String integer_field)
  throws IOException {
    this(IndexReader.open(directory), integer_field);
  }

  /**
   * Searches the provided index using the given field as the
   * primary sort.
   * The terms in the field must contain strictly integers in
   * the range <code>Integer.MIN_VALUE</code> and <code>Integer.MAX_VALUE</code> inclusive.
   * @see IndexSearcher(IndexReader)
   */
  public IntegerSortedSearcher(IndexReader r, String integer_field) {
    super(r);
    this.field = integer_field.intern();
  }

  /**
   * Sets the field to order results by.  This can be called
   * multiple times per instance of IntegerSortedSearcher.
   * @param integer_field  The field to sort results by.
   */
  public void setOrderByField(String integer_field) {
    this.field = integer_field.intern();
  }

  /**
   * Returns the name of the field currently being used
   * to sort results by.
   * @return  Field name.
   */
  public String getOrderByField() {
    return field;
  }

  /**
   * Finds the top <code>nDocs</code>
   * hits for <code>query</code>, applying <code>filter</code> if non-null.
   *
   * Overrides IndexSearcher.search to use a FieldSortedHitQueue instead of the
   * default HitQueue.
   *
   * @see IndexSearcher#search
   */
  public TopDocs search(Query query, Filter filter, final int nDocs)
  throws IOException {

    Scorer scorer = query.weight(this).scorer(reader);
    if (scorer == null) {
      return new TopDocs(0, new ScoreDoc[0]);
    }

    final BitSet bits = filter != null ? filter.bits(reader) : null;
    final FieldSortedHitQueue hq = new FieldSortedHitQueue(reader, field, nDocs);
    final int[] totalHits = new int[1];
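    // feed every scoring document that passes the filter into the field-sorted queue;
    // the queue retains at most nDocs entries, ordered by FieldSortedHitQueue.lessThan
    // rather than by score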
    scorer.score(
      new HitCollector() {
        public final void collect(int doc, float score) {
          if (score > 0.0f &&                     // ignore zeroed buckets
              (bits == null || bits.get(doc))) {  // skip docs not in bits
            totalHits[0]++;
            hq.insert(new ScoreDoc(doc, score));
          }
        }
      });

    ScoreDoc[] scoreDocs = new ScoreDoc[hq.size()];
    for (int i = hq.size() - 1; i >= 0; i--) {    // put docs in array
      scoreDocs[i] = (ScoreDoc) hq.pop();
    }

    return new TopDocs(totalHits[0], scoreDocs);
  }
}
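
Below is a minimal end-to-end sketch of the pattern the Javadoc above describes: the sort order is encoded as string integers in an indexed, unstored, untokenized field at indexing time, and an IntegerSortedSearcher returns hits in that order at search time. It is written against the 1.3-era API used in this commit; the directory path, field names, analyzer and example documents are illustrative assumptions, not part of the original change.

import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IntegerSortedSearcher;
import org.apache.lucene.search.TermQuery;

public class IntegerSortExample {
  public static void main(String[] args) throws Exception {
    // index time: give each document a "byAlpha" term holding its rank as a string
    // (indexed, not stored, not tokenized - the form prescribed by the Javadoc above)
    IndexWriter writer = new IndexWriter("/tmp/sortdemo", new SimpleAnalyzer(), true);
    String[] titles = {"apple", "banana", "cherry"};   // already in the desired order
    for (int i = 0; i < titles.length; i++) {
      Document doc = new Document();
      doc.add(Field.Text("title", titles[i]));
      doc.add(Field.Text("contents", "fruit"));
      doc.add(new Field("byAlpha", Integer.toString(i + 1), false, true, false));
      writer.addDocument(doc);
    }
    writer.close();

    // search time: hits come back ordered by the "byAlpha" values, not by score
    IndexReader reader = IndexReader.open("/tmp/sortdemo");
    IntegerSortedSearcher searcher = new IntegerSortedSearcher(reader, "byAlpha");
    Hits hits = searcher.search(new TermQuery(new Term("contents", "fruit")));
    for (int i = 0; i < hits.length(); i++) {
      System.out.println(hits.doc(i).get("title"));   // prints apple, banana, cherry
    }
    searcher.close();
  }
}

Because FieldSortedHitQueue keys its cache by field name and ties it to one IndexReader, reusing the same searcher (or another searcher over the same reader) avoids re-reading the field's terms on later searches.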