LUCENE-3722: enable sims to score > 2B docs for the distributed case

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1236668 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2012-01-27 13:04:05 +00:00
parent 17a5d3e5cb
commit 9b25df1a38
19 changed files with 55 additions and 47 deletions

View File

@@ -393,6 +393,10 @@ API Changes
   (Mike McCandless, Uwe Schindler, Robert Muir, Chris Male, Yonik Seeley,
   Jason Rutherglen, Paul Elschot)
 
+* LUCENE-3722: Similarity methods and collection/term statistics now take
+  long instead of int (to enable distributed scoring of > 2B docs).
+  (Yonik Seeley, Andrzej Bialecki, Robert Muir)
+
 New features
 
 * LUCENE-2604: Added RegexpQuery support to QueryParser. Regular expressions
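
To illustrate the CHANGES entry above (this snippet is not part of the commit): a searcher that merges statistics from several shards can end up with totals past Integer.MAX_VALUE, which is why the statistics classes now carry long values. The class name, field, term, and per-shard counts below are made up for illustration; only the CollectionStatistics and TermStatistics constructors come from this change.

    import org.apache.lucene.search.CollectionStatistics;
    import org.apache.lucene.search.TermStatistics;
    import org.apache.lucene.util.BytesRef;

    // Hypothetical aggregator: sums per-shard counts into the new long-based stats.
    public class MergedStatsExample {
      public static void main(String[] args) {
        // made-up per-shard counts; three shards of roughly 1.5 billion docs each
        long[] shardMaxDoc  = { 1500000000L, 1500000000L, 1200000000L };
        long[] shardDocFreq = {  900000000L,  950000000L,  700000000L };

        long maxDoc = 0;
        long docFreq = 0;
        for (int i = 0; i < shardMaxDoc.length; i++) {
          maxDoc  += shardMaxDoc[i];   // total exceeds 2B: would overflow an int
          docFreq += shardDocFreq[i];
        }

        // -1 is used for statistics this sketch does not compute
        // (the assertions in these classes permit -1 for such values)
        CollectionStatistics collectionStats =
            new CollectionStatistics("body", maxDoc, /*docCount*/ -1, /*sumTotalTermFreq*/ -1, /*sumDocFreq*/ -1);
        TermStatistics termStats =
            new TermStatistics(new BytesRef("lucene"), docFreq, /*totalTermFreq*/ -1);

        System.out.println("maxDoc=" + collectionStats.maxDoc() + " docFreq=" + termStats.docFreq());
      }
    }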

View File

@@ -397,6 +397,10 @@ LUCENE-1458, LUCENE-2111: Flexible Indexing
   The scorePayload method now takes a BytesRef. It is never null.
 
+* LUCENE-3722: Similarity methods and collection/term statistics now take
+  long instead of int (to enable distributed scoring of > 2B docs).
+  For example, in TFIDFSimilarity idf(int, int) is now idf(long, long).
+
 * LUCENE-3559: The methods "docFreq" and "maxDoc" on IndexSearcher were removed,
   as these are no longer used by the scoring system.
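
As a migration illustration for the note above (not part of this commit), a custom similarity that previously overrode idf(int, int) now overrides the long-based signature. The subclass below is hypothetical; only the DefaultSimilarity method it overrides comes from this change.

    import org.apache.lucene.search.similarities.DefaultSimilarity;

    // Hypothetical subclass: before this change the override was
    //   public float idf(int docFreq, int numDocs)
    public class FlatIdfSimilarity extends DefaultSimilarity {
      @Override
      public float idf(long docFreq, long numDocs) {
        return 1.0f; // ignore document frequency entirely
      }
    }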

View File

@@ -26,12 +26,12 @@ import org.apache.lucene.index.Terms; // javadocs
  */
 public class CollectionStatistics {
   private final String field;
-  private final int maxDoc;
-  private final int docCount;
+  private final long maxDoc;
+  private final long docCount;
   private final long sumTotalTermFreq;
   private final long sumDocFreq;
 
-  public CollectionStatistics(String field, int maxDoc, int docCount, long sumTotalTermFreq, long sumDocFreq) {
+  public CollectionStatistics(String field, long maxDoc, long docCount, long sumTotalTermFreq, long sumDocFreq) {
     assert maxDoc >= 0;
     assert docCount >= -1 && docCount <= maxDoc; // #docs with field must be <= #docs
     assert sumDocFreq >= -1;
@@ -51,14 +51,14 @@ public class CollectionStatistics {
   /** returns the total number of documents, regardless of
    * whether they all contain values for this field.
    * @see IndexReader#maxDoc() */
-  public final int maxDoc() {
+  public final long maxDoc() {
     return maxDoc;
   }
 
   /** returns the total number of documents that
    * have at least one term for this field.
    * @see Terms#getDocCount() */
-  public final int docCount() {
+  public final long docCount() {
     return docCount;
   }

View File

@@ -25,10 +25,10 @@ import org.apache.lucene.util.BytesRef;
  */
 public class TermStatistics {
   private final BytesRef term;
-  private final int docFreq;
+  private final long docFreq;
   private final long totalTermFreq;
 
-  public TermStatistics(BytesRef term, int docFreq, long totalTermFreq) {
+  public TermStatistics(BytesRef term, long docFreq, long totalTermFreq) {
     assert docFreq >= 0;
     assert totalTermFreq == -1 || totalTermFreq >= docFreq; // #positions must be >= #postings
     this.term = term;
@@ -43,7 +43,7 @@ public class TermStatistics {
   /** returns the number of documents this term occurs in
    * @see IndexReader#docFreq(String, BytesRef) */
-  public final int docFreq() {
+  public final long docFreq() {
     return docFreq;
   }

View File

@@ -27,7 +27,7 @@ public class AfterEffectB extends AfterEffect {
   @Override
   public final float score(BasicStats stats, float tfn) {
     long F = stats.getTotalTermFreq()+1;
-    int n = stats.getDocFreq()+1;
+    long n = stats.getDocFreq()+1;
     return (F + 1) / (n * (tfn + 1));
   }

View File

@@ -58,7 +58,7 @@ public class BM25Similarity extends Similarity {
   }
 
   /** Implemented as <code>log(1 + (numDocs - docFreq + 0.5)/(docFreq + 0.5))</code>. */
-  protected float idf(int docFreq, int numDocs) {
+  protected float idf(long docFreq, long numDocs) {
     return (float) Math.log(1 + (numDocs - docFreq + 0.5D)/(docFreq + 0.5D));
   }
@@ -131,19 +131,19 @@ public class BM25Similarity extends Similarity {
   }
 
   public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
-    final int df = termStats.docFreq();
-    final int max = collectionStats.maxDoc();
+    final long df = termStats.docFreq();
+    final long max = collectionStats.maxDoc();
     final float idf = idf(df, max);
     return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")");
   }
 
   public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
-    final int max = collectionStats.maxDoc();
+    final long max = collectionStats.maxDoc();
     float idf = 0.0f;
     final Explanation exp = new Explanation();
     exp.setDescription("idf(), sum of:");
     for (final TermStatistics stat : termStats ) {
-      final int df = stat.docFreq();
+      final long df = stat.docFreq();
       final float termIdf = idf(df, max);
       exp.addDetail(new Explanation(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"));
       idf += termIdf;

View File

@@ -26,7 +26,7 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2;
 public class BasicModelIF extends BasicModel {
   @Override
   public final float score(BasicStats stats, float tfn) {
-    int N = stats.getNumberOfDocuments();
+    long N = stats.getNumberOfDocuments();
     long F = stats.getTotalTermFreq();
     return tfn * (float)(log2(1 + (N + 1) / (F + 0.5)));
   }

View File

@@ -27,8 +27,8 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2;
 public class BasicModelIn extends BasicModel {
   @Override
   public final float score(BasicStats stats, float tfn) {
-    int N = stats.getNumberOfDocuments();
-    int n = stats.getDocFreq();
+    long N = stats.getNumberOfDocuments();
+    long n = stats.getDocFreq();
     return tfn * (float)(log2((N + 1) / (n + 0.5)));
   }

View File

@@ -27,7 +27,7 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2;
 public class BasicModelIne extends BasicModel {
   @Override
   public final float score(BasicStats stats, float tfn) {
-    int N = stats.getNumberOfDocuments();
+    long N = stats.getNumberOfDocuments();
     long F = stats.getTotalTermFreq();
     double ne = N * (1 - Math.pow((N - 1) / (double)N, F));
     return tfn * (float)(log2((N + 1) / (ne + 0.5)));

View File

@@ -25,13 +25,13 @@ import org.apache.lucene.index.Terms;
  */
 public class BasicStats extends Similarity.Stats {
   /** The number of documents. */
-  protected int numberOfDocuments;
+  protected long numberOfDocuments;
   /** The total number of tokens in the field. */
   protected long numberOfFieldTokens;
   /** The average field length. */
   protected float avgFieldLength;
   /** The document frequency. */
-  protected int docFreq;
+  protected long docFreq;
   /** The total number of occurrences of this term across all documents. */
   protected long totalTermFreq;
@@ -55,12 +55,12 @@ public class BasicStats extends Similarity.Stats {
   // ------------------------- Getter/setter methods -------------------------
 
   /** Returns the number of documents. */
-  public int getNumberOfDocuments() {
+  public long getNumberOfDocuments() {
     return numberOfDocuments;
   }
 
   /** Sets the number of documents. */
-  public void setNumberOfDocuments(int numberOfDocuments) {
+  public void setNumberOfDocuments(long numberOfDocuments) {
     this.numberOfDocuments = numberOfDocuments;
   }
@@ -91,12 +91,12 @@ public class BasicStats extends Similarity.Stats {
   }
 
   /** Returns the document frequency. */
-  public int getDocFreq() {
+  public long getDocFreq() {
     return docFreq;
   }
 
   /** Sets the document frequency. */
-  public void setDocFreq(int docFreq) {
+  public void setDocFreq(long docFreq) {
     this.docFreq = docFreq;
   }

View File

@@ -62,7 +62,7 @@ public class DefaultSimilarity extends TFIDFSimilarity {
   /** Implemented as <code>log(numDocs/(docFreq+1)) + 1</code>. */
   @Override
-  public float idf(int docFreq, int numDocs) {
+  public float idf(long docFreq, long numDocs) {
     return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0);
   }

View File

@@ -89,9 +89,9 @@ public abstract class SimilarityBase extends Similarity {
   protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
     // #positions(field) must be >= #positions(term)
     assert collectionStats.sumTotalTermFreq() == -1 || collectionStats.sumTotalTermFreq() >= termStats.totalTermFreq();
-    int numberOfDocuments = collectionStats.maxDoc();
+    long numberOfDocuments = collectionStats.maxDoc();
 
-    int docFreq = termStats.docFreq();
+    long docFreq = termStats.docFreq();
     long totalTermFreq = termStats.totalTermFreq();
 
     // codec does not supply totalTermFreq: substitute docFreq

View File

@@ -333,13 +333,13 @@ import org.apache.lucene.util.SmallFloat;
  * <i>idf(t)</i> appears for <i>t</i> in both the query and the document,
  * hence it is squared in the equation.
  * The default computation for <i>idf(t)</i> in
- * {@link org.apache.lucene.search.similarities.DefaultSimilarity#idf(int, int) DefaultSimilarity} is:
+ * {@link org.apache.lucene.search.similarities.DefaultSimilarity#idf(long, long) DefaultSimilarity} is:
  *
  * <br>&nbsp;<br>
  * <table cellpadding="2" cellspacing="2" border="0" align="center">
  *   <tr>
  *     <td valign="middle" align="right">
- *       {@link org.apache.lucene.search.similarities.DefaultSimilarity#idf(int, int) idf(t)}&nbsp; = &nbsp;
+ *       {@link org.apache.lucene.search.similarities.DefaultSimilarity#idf(long, long) idf(t)}&nbsp; = &nbsp;
  *     </td>
  *     <td valign="middle" align="center">
  *       1 + log <big>(</big>
@@ -526,7 +526,7 @@
 public abstract class TFIDFSimilarity extends Similarity {
 
   /** Computes a score factor based on a term or phrase's frequency in a
-   * document. This value is multiplied by the {@link #idf(int, int)}
+   * document. This value is multiplied by the {@link #idf(long, long)}
    * factor for each term in the query and these products are then summed to
    * form the initial score for a document.
    *
@@ -545,7 +545,7 @@ public abstract class TFIDFSimilarity extends Similarity {
   }
 
   /** Computes a score factor based on a term or phrase's frequency in a
-   * document. This value is multiplied by the {@link #idf(int, int)}
+   * document. This value is multiplied by the {@link #idf(long, long)}
    * factor for each term in the query and these products are then summed to
    * form the initial score for a document.
    *
@@ -583,8 +583,8 @@ public abstract class TFIDFSimilarity extends Similarity {
    * @throws IOException
    */
   public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
-    final int df = termStats.docFreq();
-    final int max = collectionStats.maxDoc();
+    final long df = termStats.docFreq();
+    final long max = collectionStats.maxDoc();
     final float idf = idf(df, max);
     return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")");
   }
@@ -604,12 +604,12 @@ public abstract class TFIDFSimilarity extends Similarity {
    * @throws IOException
    */
   public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
-    final int max = collectionStats.maxDoc();
+    final long max = collectionStats.maxDoc();
     float idf = 0.0f;
     final Explanation exp = new Explanation();
     exp.setDescription("idf(), sum of:");
     for (final TermStatistics stat : termStats ) {
-      final int df = stat.docFreq();
+      final long df = stat.docFreq();
       final float termIdf = idf(df, max);
       exp.addDetail(new Explanation(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"));
       idf += termIdf;
@@ -631,7 +631,7 @@ public abstract class TFIDFSimilarity extends Similarity {
    * @param numDocs the total number of documents in the collection
    * @return a score factor based on the term's document frequency
    */
-  public abstract float idf(int docFreq, int numDocs);
+  public abstract float idf(long docFreq, long numDocs);
 
   /** Cache of decoded bytes. */
   private static final float[] NORM_TABLE = new float[256];

View File

@@ -253,7 +253,7 @@ public abstract class ShardSearchingTestBase extends LuceneTestCase {
     @Override
     public TermStatistics termStatistics(Term term, TermContext context) throws IOException {
       assert term != null;
-      int docFreq = 0;
+      long docFreq = 0;
       long totalTermFreq = 0;
       for(int nodeID=0;nodeID<nodeVersions.length;nodeID++) {
@@ -268,7 +268,7 @@ public abstract class ShardSearchingTestBase extends LuceneTestCase {
           assert subStats != null;
         }
 
-        int nodeDocFreq = subStats.docFreq();
+        long nodeDocFreq = subStats.docFreq();
         if (docFreq >= 0 && nodeDocFreq >= 0) {
          docFreq += nodeDocFreq;
         } else {
@@ -291,10 +291,10 @@ public abstract class ShardSearchingTestBase extends LuceneTestCase {
       // TODO: we could compute this on init and cache,
       // since we are re-inited whenever any nodes have a
       // new reader
-      int docCount = 0;
+      long docCount = 0;
       long sumTotalTermFreq = 0;
       long sumDocFreq = 0;
-      int maxDoc = 0;
+      long maxDoc = 0;
 
       for(int nodeID=0;nodeID<nodeVersions.length;nodeID++) {
         final FieldAndShardVersion key = new FieldAndShardVersion(nodeID, nodeVersions[nodeID], field);
@@ -311,7 +311,7 @@ public abstract class ShardSearchingTestBase extends LuceneTestCase {
         // we better not have a cache miss:
         assert nodeStats != null: "myNodeID=" + myNodeID + " nodeID=" + nodeID + " version=" + nodeVersions[nodeID] + " field=" + field;
 
-        int nodeDocCount = nodeStats.docCount();
+        long nodeDocCount = nodeStats.docCount();
         if (docCount >= 0 && nodeDocCount >= 0) {
           docCount += nodeDocCount;
         } else {

View File

@@ -47,7 +47,7 @@ public class TestOmitTf extends LuceneTestCase {
     @Override public void computeNorm(FieldInvertState state, Norm norm) { norm.setByte(encodeNormValue(state.getBoost())); }
     @Override public float tf(float freq) { return freq; }
     @Override public float sloppyFreq(int distance) { return 2.0f; }
-    @Override public float idf(int docFreq, int numDocs) { return 1.0f; }
+    @Override public float idf(long docFreq, long numDocs) { return 1.0f; }
     @Override public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] termStats) {
       return new Explanation(1.0f, "Inexplicable");
     }

View File

@@ -73,7 +73,7 @@ public class TestDisjunctionMaxQuery extends LuceneTestCase {
     }
 
     @Override
-    public float idf(int docFreq, int numDocs) {
+    public float idf(long docFreq, long numDocs) {
       return 1.0f;
     }
   }

View File

@@ -49,7 +49,7 @@ public class TestSimilarity extends LuceneTestCase {
     @Override public void computeNorm(FieldInvertState state, Norm norm) { norm.setByte(encodeNormValue(state.getBoost())); }
     @Override public float tf(float freq) { return freq; }
     @Override public float sloppyFreq(int distance) { return 2.0f; }
-    @Override public float idf(int docFreq, int numDocs) { return 1.0f; }
+    @Override public float idf(long docFreq, long numDocs) { return 1.0f; }
     @Override public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] stats) {
       return new Explanation(1.0f, "Inexplicable");
     }

View File

@@ -129,7 +129,7 @@ public class TestSimilarityProvider extends LuceneTestCase {
     }
 
     @Override
-    public float idf(int docFreq, int numDocs) {
+    public float idf(long docFreq, long numDocs) {
       return 1f;
     }
@@ -157,7 +157,7 @@ public class TestSimilarityProvider extends LuceneTestCase {
     }
 
     @Override
-    public float idf(int docFreq, int numDocs) {
+    public float idf(long docFreq, long numDocs) {
       return 10f;
     }

View File

@@ -336,7 +336,7 @@ public class TestPayloadTermQuery extends LuceneTestCase {
     }
 
     @Override
-    public float idf(int docFreq, int numDocs) {
+    public float idf(long docFreq, long numDocs) {
       return 1;
     }