LUCENE-6711: Use CollectionStatistics.docCount() for IDF and average field length computations

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1695744 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2015-08-13 17:37:15 +00:00
parent 1e0e15af6c
commit 9dc862147e
16 changed files with 110 additions and 45 deletions

View File

@ -19,6 +19,10 @@ New Features
for counting ranges that align with the underlying terms as defined by the for counting ranges that align with the underlying terms as defined by the
NumberRangePrefixTree (e.g. familiar date units like days). (David Smiley) NumberRangePrefixTree (e.g. familiar date units like days). (David Smiley)
* LUCENE-6711: Use CollectionStatistics.docCount() for IDF and average field
length computations, to avoid skew from documents that don't have the field.
(Ahmet Arslan via Robert Muir)
API Changes API Changes
* LUCENE-3312: The API of oal.document was restructured to * LUCENE-3312: The API of oal.document was restructured to

View File

@ -1,5 +1,22 @@
# Apache Lucene Migration Guide # Apache Lucene Migration Guide
## How the number of documents is calculated has changed (LUCENE-6711)
The number of documents (numDocs) is used to calculate term specificity (idf) and average document length (avdl).
Prior to LUCENE-6711, collectionStats.maxDoc() was used for the statistics.
Now, collectionStats.docCount() is used whenever possible; if it is not available, maxDoc() is used.
Assume that a collection contains 100 documents, and 50 of them have "keywords" field.
In this example, maxDoc is 100 while docCount is 50 for the "keywords" field.
The total number of tokens for "keywords" field is divided by docCount to obtain avdl.
Therefore, docCount, which is the total number of documents that have at least one term for the field, is a more precise metric for optional fields.
DefaultSimilarity does not leverage avdl, so this change will have a relatively minor effect on the result list,
because the relative idf values of terms will remain the same.
However, when combined with other factors such as term frequency, relative ranking of documents could change.
Some Similarity implementations (such as the ones instantiated with NormalizationH2 and BM25) take avdl into account and will show notable changes in the ranked list,
especially for collections of documents with varying lengths,
because NormalizationH2 tends to punish documents longer than avdl.
## Separation of IndexDocument and StoredDocument (LUCENE-3312) ## Separation of IndexDocument and StoredDocument (LUCENE-3312)
The API of oal.document was restructured to differentiate between stored The API of oal.document was restructured to differentiate between stored

View File

@ -63,9 +63,9 @@ public class BM25Similarity extends Similarity {
this.b = 0.75f; this.b = 0.75f;
} }
/** Implemented as <code>log(1 + (numDocs - docFreq + 0.5)/(docFreq + 0.5))</code>. */ /** Implemented as <code>log(1 + (docCount - docFreq + 0.5)/(docFreq + 0.5))</code>. */
protected float idf(long docFreq, long numDocs) { protected float idf(long docFreq, long docCount) {
return (float) Math.log(1 + (numDocs - docFreq + 0.5D)/(docFreq + 0.5D)); return (float) Math.log(1 + (docCount - docFreq + 0.5D)/(docFreq + 0.5D));
} }
/** Implemented as <code>1 / (distance + 1)</code>. */ /** Implemented as <code>1 / (distance + 1)</code>. */
@ -78,7 +78,7 @@ public class BM25Similarity extends Similarity {
return 1; return 1;
} }
/** The default implementation computes the average as <code>sumTotalTermFreq / maxDoc</code>, /** The default implementation computes the average as <code>sumTotalTermFreq / docCount</code>,
* or returns <code>1</code> if the index does not store sumTotalTermFreq: * or returns <code>1</code> if the index does not store sumTotalTermFreq:
* any field that omits frequency information). */ * any field that omits frequency information). */
protected float avgFieldLength(CollectionStatistics collectionStats) { protected float avgFieldLength(CollectionStatistics collectionStats) {
@ -86,7 +86,8 @@ public class BM25Similarity extends Similarity {
if (sumTotalTermFreq <= 0) { if (sumTotalTermFreq <= 0) {
return 1f; // field does not exist, or stat is unsupported return 1f; // field does not exist, or stat is unsupported
} else { } else {
return (float) (sumTotalTermFreq / (double) collectionStats.maxDoc()); final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
return (float) (sumTotalTermFreq / (double) docCount);
} }
} }
@ -150,14 +151,14 @@ public class BM25Similarity extends Similarity {
* The default implementation uses: * The default implementation uses:
* *
* <pre class="prettyprint"> * <pre class="prettyprint">
* idf(docFreq, searcher.maxDoc()); * idf(docFreq, docCount);
* </pre> * </pre>
* *
* Note that {@link CollectionStatistics#maxDoc()} is used instead of * Note that {@link CollectionStatistics#docCount()} is used instead of
* {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also
* {@link TermStatistics#docFreq()} is used, and when the latter * {@link TermStatistics#docFreq()} is used, and when the latter
* is inaccurate, so is {@link CollectionStatistics#maxDoc()}, and in the same direction. * is inaccurate, so is {@link CollectionStatistics#docCount()}, and in the same direction.
* In addition, {@link CollectionStatistics#maxDoc()} is more efficient to compute * In addition, {@link CollectionStatistics#docCount()} does not skew when fields are sparse.
* *
* @param collectionStats collection-level statistics * @param collectionStats collection-level statistics
* @param termStats term-level statistics for the term * @param termStats term-level statistics for the term
@ -166,9 +167,9 @@ public class BM25Similarity extends Similarity {
*/ */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) { public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
final long df = termStats.docFreq(); final long df = termStats.docFreq();
final long max = collectionStats.maxDoc(); final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
final float idf = idf(df, max); final float idf = idf(df, docCount);
return Explanation.match(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"); return Explanation.match(idf, "idf(docFreq=" + df + ", docCount=" + docCount + ")");
} }
/** /**
@ -185,13 +186,13 @@ public class BM25Similarity extends Similarity {
* for each term. * for each term.
*/ */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) { public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
final long max = collectionStats.maxDoc(); final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
float idf = 0.0f; float idf = 0.0f;
List<Explanation> details = new ArrayList<>(); List<Explanation> details = new ArrayList<>();
for (final TermStatistics stat : termStats ) { for (final TermStatistics stat : termStats ) {
final long df = stat.docFreq(); final long df = stat.docFreq();
final float termIdf = idf(df, max); final float termIdf = idf(df, docCount);
details.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")")); details.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ")"));
idf += termIdf; idf += termIdf;
} }
return Explanation.match(idf, "idf(), sum of:", details); return Explanation.match(idf, "idf(), sum of:", details);

View File

@ -133,10 +133,10 @@ public class DefaultSimilarity extends TFIDFSimilarity {
return 1; return 1;
} }
/** Implemented as <code>log(numDocs/(docFreq+1)) + 1</code>. */ /** Implemented as <code>log(docCount/(docFreq+1)) + 1</code>. */
@Override @Override
public float idf(long docFreq, long numDocs) { public float idf(long docFreq, long docCount) {
return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0); return (float)(Math.log(docCount/(double)(docFreq+1)) + 1.0);
} }
/** /**

View File

@ -102,7 +102,7 @@ public abstract class SimilarityBase extends Similarity {
protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) { protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
// #positions(field) must be >= #positions(term) // #positions(field) must be >= #positions(term)
assert collectionStats.sumTotalTermFreq() == -1 || collectionStats.sumTotalTermFreq() >= termStats.totalTermFreq(); assert collectionStats.sumTotalTermFreq() == -1 || collectionStats.sumTotalTermFreq() >= termStats.totalTermFreq();
long numberOfDocuments = collectionStats.maxDoc(); long numberOfDocuments = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
long docFreq = termStats.docFreq(); long docFreq = termStats.docFreq();
long totalTermFreq = termStats.totalTermFreq(); long totalTermFreq = termStats.totalTermFreq();

View File

@ -346,7 +346,7 @@ import org.apache.lucene.util.BytesRef;
* </td> * </td>
* <td valign="middle" align="center"> * <td valign="middle" align="center">
* <table summary="inverse document frequency computation"> * <table summary="inverse document frequency computation">
* <tr><td align="center" style="text-align: center"><small>numDocs</small></td></tr> * <tr><td align="center" style="text-align: center"><small>docCount</small></td></tr>
* <tr><td align="center" style="text-align: center">&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;</td></tr> * <tr><td align="center" style="text-align: center">&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;</td></tr>
* <tr><td align="center" style="text-align: center"><small>docFreq+1</small></td></tr> * <tr><td align="center" style="text-align: center"><small>docFreq+1</small></td></tr>
* </table> * </table>
@ -566,14 +566,14 @@ public abstract class TFIDFSimilarity extends Similarity {
* The default implementation uses: * The default implementation uses:
* *
* <pre class="prettyprint"> * <pre class="prettyprint">
* idf(docFreq, searcher.maxDoc()); * idf(docFreq, docCount);
* </pre> * </pre>
* *
* Note that {@link CollectionStatistics#maxDoc()} is used instead of * Note that {@link CollectionStatistics#docCount()} is used instead of
* {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also
* {@link TermStatistics#docFreq()} is used, and when the latter * {@link TermStatistics#docFreq()} is used, and when the latter
* is inaccurate, so is {@link CollectionStatistics#maxDoc()}, and in the same direction. * is inaccurate, so is {@link CollectionStatistics#docCount()}, and in the same direction.
* In addition, {@link CollectionStatistics#maxDoc()} is more efficient to compute * In addition, {@link CollectionStatistics#docCount()} does not skew when fields are sparse.
* *
* @param collectionStats collection-level statistics * @param collectionStats collection-level statistics
* @param termStats term-level statistics for the term * @param termStats term-level statistics for the term
@ -582,9 +582,9 @@ public abstract class TFIDFSimilarity extends Similarity {
*/ */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) { public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
final long df = termStats.docFreq(); final long df = termStats.docFreq();
final long max = collectionStats.maxDoc(); final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
final float idf = idf(df, max); final float idf = idf(df, docCount);
return Explanation.match(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"); return Explanation.match(idf, "idf(docFreq=" + df + ", docCount=" + docCount + ")");
} }
/** /**
@ -601,13 +601,13 @@ public abstract class TFIDFSimilarity extends Similarity {
* for each term. * for each term.
*/ */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) { public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
final long max = collectionStats.maxDoc(); final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
float idf = 0.0f; float idf = 0.0f;
List<Explanation> subs = new ArrayList<>(); List<Explanation> subs = new ArrayList<>();
for (final TermStatistics stat : termStats ) { for (final TermStatistics stat : termStats ) {
final long df = stat.docFreq(); final long df = stat.docFreq();
final float termIdf = idf(df, max); final float termIdf = idf(df, docCount);
subs.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")")); subs.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ")"));
idf += termIdf; idf += termIdf;
} }
return Explanation.match(idf, "idf(), sum of:", subs); return Explanation.match(idf, "idf(), sum of:", subs);
@ -623,10 +623,10 @@ public abstract class TFIDFSimilarity extends Similarity {
* and smaller values for common terms. * and smaller values for common terms.
* *
* @param docFreq the number of documents which contain the term * @param docFreq the number of documents which contain the term
* @param numDocs the total number of documents in the collection * @param docCount the total number of documents in the collection
* @return a score factor based on the term's document frequency * @return a score factor based on the term's document frequency
*/ */
public abstract float idf(long docFreq, long numDocs); public abstract float idf(long docFreq, long docCount);
/** /**
* Compute an index-time normalization value for this field instance. * Compute an index-time normalization value for this field instance.

View File

@ -118,7 +118,7 @@ public class TestMaxTermFrequency extends LuceneTestCase {
@Override public float coord(int overlap, int maxOverlap) { return 0; } @Override public float coord(int overlap, int maxOverlap) { return 0; }
@Override public float queryNorm(float sumOfSquaredWeights) { return 0; } @Override public float queryNorm(float sumOfSquaredWeights) { return 0; }
@Override public float tf(float freq) { return 0; } @Override public float tf(float freq) { return 0; }
@Override public float idf(long docFreq, long numDocs) { return 0; } @Override public float idf(long docFreq, long docCount) { return 0; }
@Override public float sloppyFreq(int distance) { return 0; } @Override public float sloppyFreq(int distance) { return 0; }
@Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 0; } @Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 0; }
} }

View File

@ -67,7 +67,7 @@ public class TestNorms extends LuceneTestCase {
@Override public float coord(int overlap, int maxOverlap) { return 0; } @Override public float coord(int overlap, int maxOverlap) { return 0; }
@Override public float queryNorm(float sumOfSquaredWeights) { return 0; } @Override public float queryNorm(float sumOfSquaredWeights) { return 0; }
@Override public float tf(float freq) { return 0; } @Override public float tf(float freq) { return 0; }
@Override public float idf(long docFreq, long numDocs) { return 0; } @Override public float idf(long docFreq, long docCount) { return 0; }
@Override public float sloppyFreq(int distance) { return 0; } @Override public float sloppyFreq(int distance) { return 0; }
@Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 0; } @Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 0; }
} }

View File

@ -54,7 +54,7 @@ public class TestOmitTf extends LuceneTestCase {
@Override public float lengthNorm(FieldInvertState state) { return state.getBoost(); } @Override public float lengthNorm(FieldInvertState state) { return state.getBoost(); }
@Override public float tf(float freq) { return freq; } @Override public float tf(float freq) { return freq; }
@Override public float sloppyFreq(int distance) { return 2.0f; } @Override public float sloppyFreq(int distance) { return 2.0f; }
@Override public float idf(long docFreq, long numDocs) { return 1.0f; } @Override public float idf(long docFreq, long docCount) { return 1.0f; }
@Override public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] termStats) { @Override public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] termStats) {
return Explanation.match(1.0f, "Inexplicable"); return Explanation.match(1.0f, "Inexplicable");
} }

View File

@ -80,7 +80,7 @@ public class TestDisjunctionMaxQuery extends LuceneTestCase {
} }
@Override @Override
public float idf(long docFreq, long numDocs) { public float idf(long docFreq, long docCount) {
return 1.0f; return 1.0f;
} }
} }

View File

@ -46,7 +46,7 @@ public class TestSimilarity extends LuceneTestCase {
@Override public float lengthNorm(FieldInvertState state) { return state.getBoost(); } @Override public float lengthNorm(FieldInvertState state) { return state.getBoost(); }
@Override public float tf(float freq) { return freq; } @Override public float tf(float freq) { return freq; }
@Override public float sloppyFreq(int distance) { return 2.0f; } @Override public float sloppyFreq(int distance) { return 2.0f; }
@Override public float idf(long docFreq, long numDocs) { return 1.0f; } @Override public float idf(long docFreq, long docCount) { return 1.0f; }
@Override public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] stats) { @Override public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] stats) {
return Explanation.match(1.0f, "Inexplicable"); return Explanation.match(1.0f, "Inexplicable");
} }

View File

@ -142,7 +142,7 @@ public class TestSimilarityProvider extends LuceneTestCase {
} }
@Override @Override
public float idf(long docFreq, long numDocs) { public float idf(long docFreq, long docCount) {
return 1f; return 1f;
} }
@ -190,7 +190,7 @@ public class TestSimilarityProvider extends LuceneTestCase {
} }
@Override @Override
public float idf(long docFreq, long numDocs) { public float idf(long docFreq, long docCount) {
return 10f; return 10f;
} }

View File

@ -288,7 +288,7 @@ public class TestPayloadTermQuery extends LuceneTestCase {
} }
@Override @Override
public float idf(long docFreq, long numDocs) { public float idf(long docFreq, long docCount) {
return 1; return 1;
} }

View File

@ -38,6 +38,7 @@ import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
/** /**
* Tests against all the similarities we have * Tests against all the similarities we have
@ -160,6 +161,48 @@ public class TestSimilarity2 extends LuceneTestCase {
dir.close(); dir.close();
} }
/** make sure scores are not skewed by docs not containing the field */
public void testNoFieldSkew() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
Document doc = new Document();
doc.add(newTextField("foo", "bar baz somethingelse", Field.Store.NO));
iw.addDocument(doc);
IndexReader ir = iw.getReader();
IndexSearcher is = newSearcher(ir);
BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
queryBuilder.add(new TermQuery(new Term("foo", "bar")), BooleanClause.Occur.SHOULD);
queryBuilder.add(new TermQuery(new Term("foo", "baz")), BooleanClause.Occur.SHOULD);
Query query = queryBuilder.build();
// collect scores
List<Float> scores = new ArrayList<>();
for (Similarity sim : sims) {
is.setSimilarity(sim);
scores.add(is.explain(query, 0).getValue());
}
ir.close();
// add some additional docs without the field
int numExtraDocs = TestUtil.nextInt(random(), 1, 1000);
for (int i = 0; i < numExtraDocs; i++) {
iw.addDocument(new Document());
}
// check scores are the same
ir = iw.getReader();
is = newSearcher(ir);
for (int i = 0; i < sims.size(); i++) {
is.setSimilarity(sims.get(i));
assertEquals(scores.get(i).floatValue(), is.explain(query, 0).getValue(), 0F);
}
iw.close();
ir.close();
dir.close();
}
/** make sure all sims work if TF is omitted */ /** make sure all sims work if TF is omitted */
public void testOmitTF() throws Exception { public void testOmitTF() throws Exception {
Directory dir = newDirectory(); Directory dir = newDirectory();

View File

@ -203,10 +203,10 @@ class PreciseDefaultSimilarity extends TFIDFSimilarity {
return 1; return 1;
} }
/** Implemented as <code>log(numDocs/(docFreq+1)) + 1</code>. */ /** Implemented as <code>log(docCount/(docFreq+1)) + 1</code>. */
@Override @Override
public float idf(long docFreq, long numDocs) { public float idf(long docFreq, long docCount) {
return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0); return (float)(Math.log(docCount/(double)(docFreq+1)) + 1.0);
} }
/** /**

View File

@ -566,8 +566,8 @@ public class TestExtendedDismaxParser extends SolrTestCaseJ4 {
} }
public void testAliasingBoost() throws Exception { public void testAliasingBoost() throws Exception {
assertQ(req("defType","edismax", "q","Zapp Pig", "qf","myalias", "f.myalias.qf","name trait_ss^0.5"), "//result/doc[1]/str[@name='id']=42", "//result/doc[2]/str[@name='id']=47");//doc 42 should score higher than 46 assertQ(req("defType","edismax", "q","Zapp Pig", "qf","myalias", "f.myalias.qf","name trait_ss^0.1"), "//result/doc[1]/str[@name='id']=42", "//result/doc[2]/str[@name='id']=47");//doc 42 should score higher than 46
assertQ(req("defType","edismax", "q","Zapp Pig", "qf","myalias^100 name", "f.myalias.qf","trait_ss^0.5"), "//result/doc[1]/str[@name='id']=47", "//result/doc[2]/str[@name='id']=42");//Now the order should be inverse assertQ(req("defType","edismax", "q","Zapp Pig", "qf","myalias^100 name", "f.myalias.qf","trait_ss^0.1"), "//result/doc[1]/str[@name='id']=47", "//result/doc[2]/str[@name='id']=42");//Now the order should be inverse
} }
public void testCyclicAliasing() throws Exception { public void testCyclicAliasing() throws Exception {