From 9b25df1a389c21f6b415ca71c29b4b18bb0f7a39 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Fri, 27 Jan 2012 13:04:05 +0000 Subject: [PATCH] LUCENE-3722: enable sims to score > 2B docs for the distributed case git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1236668 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 4 ++++ lucene/MIGRATE.txt | 4 ++++ .../lucene/search/CollectionStatistics.java | 10 +++++----- .../apache/lucene/search/TermStatistics.java | 6 +++--- .../search/similarities/AfterEffectB.java | 2 +- .../search/similarities/BM25Similarity.java | 10 +++++----- .../search/similarities/BasicModelIF.java | 2 +- .../search/similarities/BasicModelIn.java | 4 ++-- .../search/similarities/BasicModelIne.java | 2 +- .../lucene/search/similarities/BasicStats.java | 12 ++++++------ .../search/similarities/DefaultSimilarity.java | 2 +- .../search/similarities/SimilarityBase.java | 4 ++-- .../search/similarities/TFIDFSimilarity.java | 18 +++++++++--------- .../lucene/search/ShardSearchingTestBase.java | 10 +++++----- .../org/apache/lucene/index/TestOmitTf.java | 2 +- .../lucene/search/TestDisjunctionMaxQuery.java | 2 +- .../apache/lucene/search/TestSimilarity.java | 2 +- .../lucene/search/TestSimilarityProvider.java | 4 ++-- .../search/payloads/TestPayloadTermQuery.java | 2 +- 19 files changed, 55 insertions(+), 47 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 27e1c229e63..daea07f6dd7 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -393,6 +393,10 @@ API Changes (Mike McCandless, Uwe Schindler, Robert Muir, Chris Male, Yonik Seeley, Jason Rutherglen, Paul Elschot) +* LUCENE-3722: Similarity methods and collection/term statistics now take + long instead of int (to enable distributed scoring of > 2B docs). + (Yonik Seeley, Andrzej Bialecki, Robert Muir) + New features * LUCENE-2604: Added RegexpQuery support to QueryParser. Regular expressions diff --git a/lucene/MIGRATE.txt b/lucene/MIGRATE.txt index dd50dc4a93f..38e35d7214f 100644 --- a/lucene/MIGRATE.txt +++ b/lucene/MIGRATE.txt @@ -397,6 +397,10 @@ LUCENE-1458, LUCENE-2111: Flexible Indexing The scorePayload method now takes a BytesRef. It is never null. +* LUCENE-3722: Similarity methods and collection/term statistics now take + long instead of int (to enable distributed scoring of > 2B docs). + For example, in TFIDFSimilarity idf(int, int) is now idf(long, long). + * LUCENE-3559: The methods "docFreq" and "maxDoc" on IndexSearcher were removed, as these are no longer used by the scoring system. diff --git a/lucene/src/java/org/apache/lucene/search/CollectionStatistics.java b/lucene/src/java/org/apache/lucene/search/CollectionStatistics.java index d4079ab4dc1..e0bc3e82997 100644 --- a/lucene/src/java/org/apache/lucene/search/CollectionStatistics.java +++ b/lucene/src/java/org/apache/lucene/search/CollectionStatistics.java @@ -26,12 +26,12 @@ import org.apache.lucene.index.Terms; // javadocs */ public class CollectionStatistics { private final String field; - private final int maxDoc; - private final int docCount; + private final long maxDoc; + private final long docCount; private final long sumTotalTermFreq; private final long sumDocFreq; - public CollectionStatistics(String field, int maxDoc, int docCount, long sumTotalTermFreq, long sumDocFreq) { + public CollectionStatistics(String field, long maxDoc, long docCount, long sumTotalTermFreq, long sumDocFreq) { assert maxDoc >= 0; assert docCount >= -1 && docCount <= maxDoc; // #docs with field must be <= #docs assert sumDocFreq >= -1; @@ -51,14 +51,14 @@ public class CollectionStatistics { /** returns the total number of documents, regardless of * whether they all contain values for this field. * @see IndexReader#maxDoc() */ - public final int maxDoc() { + public final long maxDoc() { return maxDoc; } /** returns the total number of documents that * have at least one term for this field. * @see Terms#getDocCount() */ - public final int docCount() { + public final long docCount() { return docCount; } diff --git a/lucene/src/java/org/apache/lucene/search/TermStatistics.java b/lucene/src/java/org/apache/lucene/search/TermStatistics.java index 839567c448d..dade6608b94 100644 --- a/lucene/src/java/org/apache/lucene/search/TermStatistics.java +++ b/lucene/src/java/org/apache/lucene/search/TermStatistics.java @@ -25,10 +25,10 @@ import org.apache.lucene.util.BytesRef; */ public class TermStatistics { private final BytesRef term; - private final int docFreq; + private final long docFreq; private final long totalTermFreq; - public TermStatistics(BytesRef term, int docFreq, long totalTermFreq) { + public TermStatistics(BytesRef term, long docFreq, long totalTermFreq) { assert docFreq >= 0; assert totalTermFreq == -1 || totalTermFreq >= docFreq; // #positions must be >= #postings this.term = term; @@ -43,7 +43,7 @@ public class TermStatistics { /** returns the number of documents this term occurs in * @see IndexReader#docFreq(String, BytesRef) */ - public final int docFreq() { + public final long docFreq() { return docFreq; } diff --git a/lucene/src/java/org/apache/lucene/search/similarities/AfterEffectB.java b/lucene/src/java/org/apache/lucene/search/similarities/AfterEffectB.java index 62d3bd6f505..49ddbc01cc3 100644 --- a/lucene/src/java/org/apache/lucene/search/similarities/AfterEffectB.java +++ b/lucene/src/java/org/apache/lucene/search/similarities/AfterEffectB.java @@ -27,7 +27,7 @@ public class AfterEffectB extends AfterEffect { @Override public final float score(BasicStats stats, float tfn) { long F = stats.getTotalTermFreq()+1; - int n = stats.getDocFreq()+1; + long n = stats.getDocFreq()+1; return (F + 1) / (n * (tfn + 1)); } diff --git a/lucene/src/java/org/apache/lucene/search/similarities/BM25Similarity.java b/lucene/src/java/org/apache/lucene/search/similarities/BM25Similarity.java index 3e0be17ef13..68dc437f591 100644 --- a/lucene/src/java/org/apache/lucene/search/similarities/BM25Similarity.java +++ b/lucene/src/java/org/apache/lucene/search/similarities/BM25Similarity.java @@ -58,7 +58,7 @@ public class BM25Similarity extends Similarity { } /** Implemented as log(1 + (numDocs - docFreq + 0.5)/(docFreq + 0.5)). */ - protected float idf(int docFreq, int numDocs) { + protected float idf(long docFreq, long numDocs) { return (float) Math.log(1 + (numDocs - docFreq + 0.5D)/(docFreq + 0.5D)); } @@ -131,19 +131,19 @@ public class BM25Similarity extends Similarity { } public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) { - final int df = termStats.docFreq(); - final int max = collectionStats.maxDoc(); + final long df = termStats.docFreq(); + final long max = collectionStats.maxDoc(); final float idf = idf(df, max); return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"); } public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) { - final int max = collectionStats.maxDoc(); + final long max = collectionStats.maxDoc(); float idf = 0.0f; final Explanation exp = new Explanation(); exp.setDescription("idf(), sum of:"); for (final TermStatistics stat : termStats ) { - final int df = stat.docFreq(); + final long df = stat.docFreq(); final float termIdf = idf(df, max); exp.addDetail(new Explanation(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")")); idf += termIdf; diff --git a/lucene/src/java/org/apache/lucene/search/similarities/BasicModelIF.java b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelIF.java index 3cef323d11c..fda74719b81 100644 --- a/lucene/src/java/org/apache/lucene/search/similarities/BasicModelIF.java +++ b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelIF.java @@ -26,7 +26,7 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2; public class BasicModelIF extends BasicModel { @Override public final float score(BasicStats stats, float tfn) { - int N = stats.getNumberOfDocuments(); + long N = stats.getNumberOfDocuments(); long F = stats.getTotalTermFreq(); return tfn * (float)(log2(1 + (N + 1) / (F + 0.5))); } diff --git a/lucene/src/java/org/apache/lucene/search/similarities/BasicModelIn.java b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelIn.java index a61222e5075..3f8afaf4ceb 100644 --- a/lucene/src/java/org/apache/lucene/search/similarities/BasicModelIn.java +++ b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelIn.java @@ -27,8 +27,8 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2; public class BasicModelIn extends BasicModel { @Override public final float score(BasicStats stats, float tfn) { - int N = stats.getNumberOfDocuments(); - int n = stats.getDocFreq(); + long N = stats.getNumberOfDocuments(); + long n = stats.getDocFreq(); return tfn * (float)(log2((N + 1) / (n + 0.5))); } diff --git a/lucene/src/java/org/apache/lucene/search/similarities/BasicModelIne.java b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelIne.java index cdbdeb4edd1..a2262b66f96 100644 --- a/lucene/src/java/org/apache/lucene/search/similarities/BasicModelIne.java +++ b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelIne.java @@ -27,7 +27,7 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2; public class BasicModelIne extends BasicModel { @Override public final float score(BasicStats stats, float tfn) { - int N = stats.getNumberOfDocuments(); + long N = stats.getNumberOfDocuments(); long F = stats.getTotalTermFreq(); double ne = N * (1 - Math.pow((N - 1) / (double)N, F)); return tfn * (float)(log2((N + 1) / (ne + 0.5))); diff --git a/lucene/src/java/org/apache/lucene/search/similarities/BasicStats.java b/lucene/src/java/org/apache/lucene/search/similarities/BasicStats.java index a96e7a0aa74..7bde718d510 100644 --- a/lucene/src/java/org/apache/lucene/search/similarities/BasicStats.java +++ b/lucene/src/java/org/apache/lucene/search/similarities/BasicStats.java @@ -25,13 +25,13 @@ import org.apache.lucene.index.Terms; */ public class BasicStats extends Similarity.Stats { /** The number of documents. */ - protected int numberOfDocuments; + protected long numberOfDocuments; /** The total number of tokens in the field. */ protected long numberOfFieldTokens; /** The average field length. */ protected float avgFieldLength; /** The document frequency. */ - protected int docFreq; + protected long docFreq; /** The total number of occurrences of this term across all documents. */ protected long totalTermFreq; @@ -55,12 +55,12 @@ public class BasicStats extends Similarity.Stats { // ------------------------- Getter/setter methods ------------------------- /** Returns the number of documents. */ - public int getNumberOfDocuments() { + public long getNumberOfDocuments() { return numberOfDocuments; } /** Sets the number of documents. */ - public void setNumberOfDocuments(int numberOfDocuments) { + public void setNumberOfDocuments(long numberOfDocuments) { this.numberOfDocuments = numberOfDocuments; } @@ -91,12 +91,12 @@ public class BasicStats extends Similarity.Stats { } /** Returns the document frequency. */ - public int getDocFreq() { + public long getDocFreq() { return docFreq; } /** Sets the document frequency. */ - public void setDocFreq(int docFreq) { + public void setDocFreq(long docFreq) { this.docFreq = docFreq; } diff --git a/lucene/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java b/lucene/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java index f2058dd973f..f56aed5418e 100644 --- a/lucene/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java +++ b/lucene/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java @@ -62,7 +62,7 @@ public class DefaultSimilarity extends TFIDFSimilarity { /** Implemented as log(numDocs/(docFreq+1)) + 1. */ @Override - public float idf(int docFreq, int numDocs) { + public float idf(long docFreq, long numDocs) { return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0); } diff --git a/lucene/src/java/org/apache/lucene/search/similarities/SimilarityBase.java b/lucene/src/java/org/apache/lucene/search/similarities/SimilarityBase.java index 70ef26256df..eb9d0c0fb2c 100644 --- a/lucene/src/java/org/apache/lucene/search/similarities/SimilarityBase.java +++ b/lucene/src/java/org/apache/lucene/search/similarities/SimilarityBase.java @@ -89,9 +89,9 @@ public abstract class SimilarityBase extends Similarity { protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) { // #positions(field) must be >= #positions(term) assert collectionStats.sumTotalTermFreq() == -1 || collectionStats.sumTotalTermFreq() >= termStats.totalTermFreq(); - int numberOfDocuments = collectionStats.maxDoc(); + long numberOfDocuments = collectionStats.maxDoc(); - int docFreq = termStats.docFreq(); + long docFreq = termStats.docFreq(); long totalTermFreq = termStats.totalTermFreq(); // codec does not supply totalTermFreq: substitute docFreq diff --git a/lucene/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java b/lucene/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java index a61e6a96ce4..e33860ae484 100644 --- a/lucene/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java +++ b/lucene/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java @@ -333,13 +333,13 @@ import org.apache.lucene.util.SmallFloat; * idf(t) appears for t in both the query and the document, * hence it is squared in the equation. * The default computation for idf(t) in - * {@link org.apache.lucene.search.similarities.DefaultSimilarity#idf(int, int) DefaultSimilarity} is: + * {@link org.apache.lucene.search.similarities.DefaultSimilarity#idf(long, long) DefaultSimilarity} is: * *
 
* * * *
- * {@link org.apache.lucene.search.similarities.DefaultSimilarity#idf(int, int) idf(t)}  =   + * {@link org.apache.lucene.search.similarities.DefaultSimilarity#idf(long, long) idf(t)}  =   * * 1 + log ( @@ -526,7 +526,7 @@ import org.apache.lucene.util.SmallFloat; public abstract class TFIDFSimilarity extends Similarity { /** Computes a score factor based on a term or phrase's frequency in a - * document. This value is multiplied by the {@link #idf(int, int)} + * document. This value is multiplied by the {@link #idf(long, long)} * factor for each term in the query and these products are then summed to * form the initial score for a document. * @@ -545,7 +545,7 @@ public abstract class TFIDFSimilarity extends Similarity { } /** Computes a score factor based on a term or phrase's frequency in a - * document. This value is multiplied by the {@link #idf(int, int)} + * document. This value is multiplied by the {@link #idf(long, long)} * factor for each term in the query and these products are then summed to * form the initial score for a document. * @@ -583,8 +583,8 @@ public abstract class TFIDFSimilarity extends Similarity { * @throws IOException */ public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) { - final int df = termStats.docFreq(); - final int max = collectionStats.maxDoc(); + final long df = termStats.docFreq(); + final long max = collectionStats.maxDoc(); final float idf = idf(df, max); return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"); } @@ -604,12 +604,12 @@ public abstract class TFIDFSimilarity extends Similarity { * @throws IOException */ public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) { - final int max = collectionStats.maxDoc(); + final long max = collectionStats.maxDoc(); float idf = 0.0f; final Explanation exp = new Explanation(); exp.setDescription("idf(), sum of:"); for (final TermStatistics stat : termStats ) { - final int df = stat.docFreq(); + final long df = stat.docFreq(); final float termIdf = idf(df, max); exp.addDetail(new Explanation(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")")); idf += termIdf; @@ -631,7 +631,7 @@ public abstract class TFIDFSimilarity extends Similarity { * @param numDocs the total number of documents in the collection * @return a score factor based on the term's document frequency */ - public abstract float idf(int docFreq, int numDocs); + public abstract float idf(long docFreq, long numDocs); /** Cache of decoded bytes. */ private static final float[] NORM_TABLE = new float[256]; diff --git a/lucene/src/test-framework/java/org/apache/lucene/search/ShardSearchingTestBase.java b/lucene/src/test-framework/java/org/apache/lucene/search/ShardSearchingTestBase.java index 0bba6adc8d5..8d73de5c937 100644 --- a/lucene/src/test-framework/java/org/apache/lucene/search/ShardSearchingTestBase.java +++ b/lucene/src/test-framework/java/org/apache/lucene/search/ShardSearchingTestBase.java @@ -253,7 +253,7 @@ public abstract class ShardSearchingTestBase extends LuceneTestCase { @Override public TermStatistics termStatistics(Term term, TermContext context) throws IOException { assert term != null; - int docFreq = 0; + long docFreq = 0; long totalTermFreq = 0; for(int nodeID=0;nodeID= 0 && nodeDocFreq >= 0) { docFreq += nodeDocFreq; } else { @@ -291,10 +291,10 @@ public abstract class ShardSearchingTestBase extends LuceneTestCase { // TODO: we could compute this on init and cache, // since we are re-inited whenever any nodes have a // new reader - int docCount = 0; + long docCount = 0; long sumTotalTermFreq = 0; long sumDocFreq = 0; - int maxDoc = 0; + long maxDoc = 0; for(int nodeID=0;nodeID