mirror of https://github.com/apache/lucene.git
LUCENE-3722: enable sims to score > 2B docs for the distributed case
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1236668 13f79535-47bb-0310-9956-ffa450edef68
parent 17a5d3e5cb
commit 9b25df1a38
@@ -393,6 +393,10 @@ API Changes
   (Mike McCandless, Uwe Schindler, Robert Muir, Chris Male, Yonik Seeley,
   Jason Rutherglen, Paul Elschot)
 
+* LUCENE-3722: Similarity methods and collection/term statistics now take
+  long instead of int (to enable distributed scoring of > 2B docs).
+  (Yonik Seeley, Andrzej Bialecki, Robert Muir)
+
 New features
 
 * LUCENE-2604: Added RegexpQuery support to QueryParser. Regular expressions
@@ -397,6 +397,10 @@ LUCENE-1458, LUCENE-2111: Flexible Indexing
   The scorePayload method now takes a BytesRef. It is never null.
 
+* LUCENE-3722: Similarity methods and collection/term statistics now take
+  long instead of int (to enable distributed scoring of > 2B docs).
+  For example, in TFIDFSimilarity idf(int, int) is now idf(long, long).
+
 * LUCENE-3559: The methods "docFreq" and "maxDoc" on IndexSearcher were removed,
   as these are no longer used by the scoring system.
 
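As a rough migration sketch (not part of this commit; the class name is hypothetical), a custom similarity that previously overrode idf(int, int) now overrides the long variant; only the parameter types change:

    import org.apache.lucene.search.similarities.DefaultSimilarity;

    // Hypothetical custom similarity; the point is the widened idf signature.
    public class MyCustomSimilarity extends DefaultSimilarity {
      // Before LUCENE-3722 this override was: public float idf(int docFreq, int numDocs)
      @Override
      public float idf(long docFreq, long numDocs) {
        // Same formula as DefaultSimilarity; parameters widen from int to long.
        return (float) (Math.log(numDocs / (double) (docFreq + 1)) + 1.0);
      }
    }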
@@ -26,12 +26,12 @@ import org.apache.lucene.index.Terms; // javadocs
  */
 public class CollectionStatistics {
   private final String field;
-  private final int maxDoc;
-  private final int docCount;
+  private final long maxDoc;
+  private final long docCount;
   private final long sumTotalTermFreq;
   private final long sumDocFreq;
 
-  public CollectionStatistics(String field, int maxDoc, int docCount, long sumTotalTermFreq, long sumDocFreq) {
+  public CollectionStatistics(String field, long maxDoc, long docCount, long sumTotalTermFreq, long sumDocFreq) {
     assert maxDoc >= 0;
     assert docCount >= -1 && docCount <= maxDoc; // #docs with field must be <= #docs
     assert sumDocFreq >= -1;
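With maxDoc and docCount widened to long, a caller that aggregates per-shard totals can pass counts beyond Integer.MAX_VALUE. A minimal sketch, with a hypothetical field name and hypothetical shard-aggregated values:

    import org.apache.lucene.search.CollectionStatistics;

    public class CollectionStatisticsExample {
      public static void main(String[] args) {
        // Hypothetical totals summed across shards; 5 billion docs do not fit in an int.
        long maxDoc = 5000000000L;
        long docCount = 4800000000L;
        long sumTotalTermFreq = 90000000000L;
        long sumDocFreq = 60000000000L;
        CollectionStatistics stats =
            new CollectionStatistics("body", maxDoc, docCount, sumTotalTermFreq, sumDocFreq);
        System.out.println(stats.maxDoc() + " docs across all shards");
      }
    }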
@@ -51,14 +51,14 @@ public class CollectionStatistics {
   /** returns the total number of documents, regardless of
    * whether they all contain values for this field.
    * @see IndexReader#maxDoc() */
-  public final int maxDoc() {
+  public final long maxDoc() {
     return maxDoc;
   }
 
   /** returns the total number of documents that
    * have at least one term for this field.
    * @see Terms#getDocCount() */
-  public final int docCount() {
+  public final long docCount() {
     return docCount;
   }
 
@@ -25,10 +25,10 @@ import org.apache.lucene.util.BytesRef;
  */
 public class TermStatistics {
   private final BytesRef term;
-  private final int docFreq;
+  private final long docFreq;
   private final long totalTermFreq;
 
-  public TermStatistics(BytesRef term, int docFreq, long totalTermFreq) {
+  public TermStatistics(BytesRef term, long docFreq, long totalTermFreq) {
     assert docFreq >= 0;
     assert totalTermFreq == -1 || totalTermFreq >= docFreq; // #positions must be >= #postings
     this.term = term;
@@ -43,7 +43,7 @@ public class TermStatistics {
 
   /** returns the number of documents this term occurs in
    * @see IndexReader#docFreq(String, BytesRef) */
-  public final int docFreq() {
+  public final long docFreq() {
     return docFreq;
   }
 
@@ -27,7 +27,7 @@ public class AfterEffectB extends AfterEffect {
   @Override
   public final float score(BasicStats stats, float tfn) {
     long F = stats.getTotalTermFreq()+1;
-    int n = stats.getDocFreq()+1;
+    long n = stats.getDocFreq()+1;
     return (F + 1) / (n * (tfn + 1));
   }
 
@@ -58,7 +58,7 @@ public class BM25Similarity extends Similarity {
   }
 
   /** Implemented as <code>log(1 + (numDocs - docFreq + 0.5)/(docFreq + 0.5))</code>. */
-  protected float idf(int docFreq, int numDocs) {
+  protected float idf(long docFreq, long numDocs) {
     return (float) Math.log(1 + (numDocs - docFreq + 0.5D)/(docFreq + 0.5D));
   }
 
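The widened signature matters for the formula documented above: once the collection holds more than 2^31 documents, numDocs can no longer be passed as an int. A self-contained sketch of the same computation with a hypothetical three-billion-document distributed total (the formula is copied here only for illustration):

    public class Bm25IdfExample {
      // Reproduces the BM25 idf formula shown above, for illustration only.
      static float idf(long docFreq, long numDocs) {
        return (float) Math.log(1 + (numDocs - docFreq + 0.5D) / (docFreq + 0.5D));
      }

      public static void main(String[] args) {
        long numDocs = 3000000000L; // > Integer.MAX_VALUE (2147483647), hypothetical distributed total
        long docFreq = 1500000L;
        System.out.println(idf(docFreq, numDocs)); // log(1 + ~2000) is roughly 7.6
      }
    }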
@@ -131,19 +131,19 @@ public class BM25Similarity extends Similarity {
   }
 
   public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
-    final int df = termStats.docFreq();
-    final int max = collectionStats.maxDoc();
+    final long df = termStats.docFreq();
+    final long max = collectionStats.maxDoc();
     final float idf = idf(df, max);
     return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")");
   }
 
   public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
-    final int max = collectionStats.maxDoc();
+    final long max = collectionStats.maxDoc();
     float idf = 0.0f;
     final Explanation exp = new Explanation();
     exp.setDescription("idf(), sum of:");
     for (final TermStatistics stat : termStats ) {
-      final int df = stat.docFreq();
+      final long df = stat.docFreq();
       final float termIdf = idf(df, max);
       exp.addDetail(new Explanation(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"));
       idf += termIdf;
@@ -26,7 +26,7 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2;
 public class BasicModelIF extends BasicModel {
   @Override
   public final float score(BasicStats stats, float tfn) {
-    int N = stats.getNumberOfDocuments();
+    long N = stats.getNumberOfDocuments();
     long F = stats.getTotalTermFreq();
     return tfn * (float)(log2(1 + (N + 1) / (F + 0.5)));
   }
@@ -27,8 +27,8 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2;
 public class BasicModelIn extends BasicModel {
   @Override
   public final float score(BasicStats stats, float tfn) {
-    int N = stats.getNumberOfDocuments();
-    int n = stats.getDocFreq();
+    long N = stats.getNumberOfDocuments();
+    long n = stats.getDocFreq();
     return tfn * (float)(log2((N + 1) / (n + 0.5)));
   }
 
@@ -27,7 +27,7 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2;
 public class BasicModelIne extends BasicModel {
   @Override
   public final float score(BasicStats stats, float tfn) {
-    int N = stats.getNumberOfDocuments();
+    long N = stats.getNumberOfDocuments();
     long F = stats.getTotalTermFreq();
     double ne = N * (1 - Math.pow((N - 1) / (double)N, F));
     return tfn * (float)(log2((N + 1) / (ne + 0.5)));
@@ -25,13 +25,13 @@ import org.apache.lucene.index.Terms;
  */
 public class BasicStats extends Similarity.Stats {
   /** The number of documents. */
-  protected int numberOfDocuments;
+  protected long numberOfDocuments;
   /** The total number of tokens in the field. */
   protected long numberOfFieldTokens;
   /** The average field length. */
   protected float avgFieldLength;
   /** The document frequency. */
-  protected int docFreq;
+  protected long docFreq;
   /** The total number of occurrences of this term across all documents. */
   protected long totalTermFreq;
 
@@ -55,12 +55,12 @@ public class BasicStats extends Similarity.Stats {
   // ------------------------- Getter/setter methods -------------------------
 
   /** Returns the number of documents. */
-  public int getNumberOfDocuments() {
+  public long getNumberOfDocuments() {
     return numberOfDocuments;
   }
 
   /** Sets the number of documents. */
-  public void setNumberOfDocuments(int numberOfDocuments) {
+  public void setNumberOfDocuments(long numberOfDocuments) {
     this.numberOfDocuments = numberOfDocuments;
   }
 
@@ -91,12 +91,12 @@ public class BasicStats extends Similarity.Stats {
   }
 
   /** Returns the document frequency. */
-  public int getDocFreq() {
+  public long getDocFreq() {
     return docFreq;
   }
 
   /** Sets the document frequency. */
-  public void setDocFreq(int docFreq) {
+  public void setDocFreq(long docFreq) {
     this.docFreq = docFreq;
   }
 
@@ -62,7 +62,7 @@ public class DefaultSimilarity extends TFIDFSimilarity {
 
   /** Implemented as <code>log(numDocs/(docFreq+1)) + 1</code>. */
   @Override
-  public float idf(int docFreq, int numDocs) {
+  public float idf(long docFreq, long numDocs) {
     return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0);
   }
 
@@ -89,9 +89,9 @@ public abstract class SimilarityBase extends Similarity {
   protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
     // #positions(field) must be >= #positions(term)
     assert collectionStats.sumTotalTermFreq() == -1 || collectionStats.sumTotalTermFreq() >= termStats.totalTermFreq();
-    int numberOfDocuments = collectionStats.maxDoc();
+    long numberOfDocuments = collectionStats.maxDoc();
 
-    int docFreq = termStats.docFreq();
+    long docFreq = termStats.docFreq();
     long totalTermFreq = termStats.totalTermFreq();
 
     // codec does not supply totalTermFreq: substitute docFreq
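The trailing comment refers to codecs that report totalTermFreq as -1 (unknown). The rest of the method is cut off in this hunk; the helper below is only an illustrative sketch of the fallback the comment describes, not the actual SimilarityBase code:

    public class TotalTermFreqFallback {
      // A codec that does not track totalTermFreq reports -1; docFreq is a safe
      // lower bound because the term occurs at least once in each matching doc.
      static long totalTermFreqOrDocFreq(long totalTermFreq, long docFreq) {
        return totalTermFreq == -1 ? docFreq : totalTermFreq;
      }

      public static void main(String[] args) {
        System.out.println(totalTermFreqOrDocFreq(-1, 42L)); // prints 42
      }
    }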
@@ -333,13 +333,13 @@ import org.apache.lucene.util.SmallFloat;
  * <i>idf(t)</i> appears for <i>t</i> in both the query and the document,
  * hence it is squared in the equation.
  * The default computation for <i>idf(t)</i> in
- * {@link org.apache.lucene.search.similarities.DefaultSimilarity#idf(int, int) DefaultSimilarity} is:
+ * {@link org.apache.lucene.search.similarities.DefaultSimilarity#idf(long, long) DefaultSimilarity} is:
  *
  * <br> <br>
  * <table cellpadding="2" cellspacing="2" border="0" align="center">
  * <tr>
  *   <td valign="middle" align="right">
- *     {@link org.apache.lucene.search.similarities.DefaultSimilarity#idf(int, int) idf(t)} =
+ *     {@link org.apache.lucene.search.similarities.DefaultSimilarity#idf(long, long) idf(t)} =
  *   </td>
  *   <td valign="middle" align="center">
  *     1 + log <big>(</big>
@@ -526,7 +526,7 @@ import org.apache.lucene.util.SmallFloat;
 public abstract class TFIDFSimilarity extends Similarity {
 
   /** Computes a score factor based on a term or phrase's frequency in a
-   * document. This value is multiplied by the {@link #idf(int, int)}
+   * document. This value is multiplied by the {@link #idf(long, long)}
    * factor for each term in the query and these products are then summed to
    * form the initial score for a document.
    *
@@ -545,7 +545,7 @@ public abstract class TFIDFSimilarity extends Similarity {
   }
 
   /** Computes a score factor based on a term or phrase's frequency in a
-   * document. This value is multiplied by the {@link #idf(int, int)}
+   * document. This value is multiplied by the {@link #idf(long, long)}
    * factor for each term in the query and these products are then summed to
    * form the initial score for a document.
    *
@@ -583,8 +583,8 @@ public abstract class TFIDFSimilarity extends Similarity {
    * @throws IOException
    */
   public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
-    final int df = termStats.docFreq();
-    final int max = collectionStats.maxDoc();
+    final long df = termStats.docFreq();
+    final long max = collectionStats.maxDoc();
     final float idf = idf(df, max);
     return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")");
   }
@@ -604,12 +604,12 @@ public abstract class TFIDFSimilarity extends Similarity {
    * @throws IOException
    */
   public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
-    final int max = collectionStats.maxDoc();
+    final long max = collectionStats.maxDoc();
     float idf = 0.0f;
     final Explanation exp = new Explanation();
     exp.setDescription("idf(), sum of:");
     for (final TermStatistics stat : termStats ) {
-      final int df = stat.docFreq();
+      final long df = stat.docFreq();
       final float termIdf = idf(df, max);
       exp.addDetail(new Explanation(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"));
       idf += termIdf;
@@ -631,7 +631,7 @@ public abstract class TFIDFSimilarity extends Similarity {
    * @param numDocs the total number of documents in the collection
    * @return a score factor based on the term's document frequency
    */
-  public abstract float idf(int docFreq, int numDocs);
+  public abstract float idf(long docFreq, long numDocs);
 
   /** Cache of decoded bytes. */
   private static final float[] NORM_TABLE = new float[256];
@@ -253,7 +253,7 @@ public abstract class ShardSearchingTestBase extends LuceneTestCase {
     @Override
     public TermStatistics termStatistics(Term term, TermContext context) throws IOException {
       assert term != null;
-      int docFreq = 0;
+      long docFreq = 0;
       long totalTermFreq = 0;
       for(int nodeID=0;nodeID<nodeVersions.length;nodeID++) {
 
@@ -268,7 +268,7 @@ public abstract class ShardSearchingTestBase extends LuceneTestCase {
           assert subStats != null;
         }
 
-        int nodeDocFreq = subStats.docFreq();
+        long nodeDocFreq = subStats.docFreq();
         if (docFreq >= 0 && nodeDocFreq >= 0) {
           docFreq += nodeDocFreq;
         } else {
@@ -291,10 +291,10 @@ public abstract class ShardSearchingTestBase extends LuceneTestCase {
       // TODO: we could compute this on init and cache,
       // since we are re-inited whenever any nodes have a
       // new reader
-      int docCount = 0;
+      long docCount = 0;
       long sumTotalTermFreq = 0;
       long sumDocFreq = 0;
-      int maxDoc = 0;
+      long maxDoc = 0;
 
       for(int nodeID=0;nodeID<nodeVersions.length;nodeID++) {
         final FieldAndShardVersion key = new FieldAndShardVersion(nodeID, nodeVersions[nodeID], field);
@@ -311,7 +311,7 @@ public abstract class ShardSearchingTestBase extends LuceneTestCase {
        // we better not have a cache miss:
        assert nodeStats != null: "myNodeID=" + myNodeID + " nodeID=" + nodeID + " version=" + nodeVersions[nodeID] + " field=" + field;
 
-        long nodeDocCount = nodeStats.docCount();
+        long nodeDocCount = nodeStats.docCount();
         if (docCount >= 0 && nodeDocCount >= 0) {
           docCount += nodeDocCount;
         } else {
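The two aggregation loops above are the distributed case that motivates long statistics: each node contributes a non-negative count, or -1 when the statistic is unavailable, and the per-node values are summed into long accumulators. A self-contained sketch of that pattern (node values hypothetical; the else branches above are cut off, so the "mark as unknown" handling here is an assumption):

    public class ShardStatsSum {
      // Mirrors the aggregation pattern above: sum per-node counts, but if any node
      // reports -1 (statistic unavailable), treat the combined value as unknown too.
      static long sum(long[] perNode) {
        long total = 0;
        for (long nodeValue : perNode) {
          if (total >= 0 && nodeValue >= 0) {
            total += nodeValue;
          } else {
            total = -1;
            break;
          }
        }
        return total;
      }

      public static void main(String[] args) {
        // Two shards of 1.5 billion docs each already overflow an int accumulator.
        long[] maxDocPerNode = {1500000000L, 1500000000L};
        System.out.println(sum(maxDocPerNode)); // 3000000000
      }
    }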
@@ -47,7 +47,7 @@ public class TestOmitTf extends LuceneTestCase {
     @Override public void computeNorm(FieldInvertState state, Norm norm) { norm.setByte(encodeNormValue(state.getBoost())); }
     @Override public float tf(float freq) { return freq; }
     @Override public float sloppyFreq(int distance) { return 2.0f; }
-    @Override public float idf(int docFreq, int numDocs) { return 1.0f; }
+    @Override public float idf(long docFreq, long numDocs) { return 1.0f; }
     @Override public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] termStats) {
       return new Explanation(1.0f, "Inexplicable");
     }
@@ -73,7 +73,7 @@ public class TestDisjunctionMaxQuery extends LuceneTestCase {
     }
 
     @Override
-    public float idf(int docFreq, int numDocs) {
+    public float idf(long docFreq, long numDocs) {
       return 1.0f;
     }
   }
@@ -49,7 +49,7 @@ public class TestSimilarity extends LuceneTestCase {
     @Override public void computeNorm(FieldInvertState state, Norm norm) { norm.setByte(encodeNormValue(state.getBoost())); }
     @Override public float tf(float freq) { return freq; }
     @Override public float sloppyFreq(int distance) { return 2.0f; }
-    @Override public float idf(int docFreq, int numDocs) { return 1.0f; }
+    @Override public float idf(long docFreq, long numDocs) { return 1.0f; }
     @Override public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] stats) {
       return new Explanation(1.0f, "Inexplicable");
     }
@@ -129,7 +129,7 @@ public class TestSimilarityProvider extends LuceneTestCase {
     }
 
     @Override
-    public float idf(int docFreq, int numDocs) {
+    public float idf(long docFreq, long numDocs) {
       return 1f;
     }
 
@@ -157,7 +157,7 @@ public class TestSimilarityProvider extends LuceneTestCase {
     }
 
     @Override
-    public float idf(int docFreq, int numDocs) {
+    public float idf(long docFreq, long numDocs) {
       return 10f;
     }
 
@@ -336,7 +336,7 @@ public class TestPayloadTermQuery extends LuceneTestCase {
     }
 
     @Override
-    public float idf(int docFreq, int numDocs) {
+    public float idf(long docFreq, long numDocs) {
       return 1;
     }
 