LUCENE-6711: Use CollectionStatistics.docCount() for IDF and average field length computations

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1695744 13f79535-47bb-0310-9956-ffa450edef68
2015-08-13 17:37:15 +00:00 · 2015-08-13 17:37:15 +00:00 · 9dc862147e
parent 1e0e15af6c
commit 9dc862147e
16 changed files with 110 additions and 45 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -19,6 +19,10 @@ New Features
  for counting ranges that align with the underlying terms as defined by the
  NumberRangePrefixTree (e.g. familiar date units like days).  (David Smiley)

+* LUCENE-6711: Use CollectionStatistics.docCount() for IDF and average field
+  length computations, to avoid skew from documents that don't have the field.
+  (Ahmet Arslan via Robert Muir)
+
 API Changes

 * LUCENE-3312: The API of oal.document was restructured to
--- a/lucene/MIGRATE.txt
+++ b/lucene/MIGRATE.txt
@ -1,5 +1,22 @@
 # Apache Lucene Migration Guide

+## The way how number of document calculated is changed (LUCENE-6711)
+The number of documents (numDocs) is used to calculate term specificity (idf) and average document length (avdl).
+Prior to LUCENE-6711, collectionStats.maxDoc() was used for the statistics.
+Now, collectionStats.docCount() is used whenever possible, if not maxDocs() is used.
+
+Assume that a collection contains 100 documents, and 50 of them have "keywords" field.
+In this example, maxDocs is 100 while docCount is 50 for the "keywords" field.
+The total number of tokens for "keywords" field is divided by docCount to obtain avdl.
+Therefore, docCount which is the total number of documents that have at least one term for the field, is a more precise metric for optional fields.
+
+DefaultSimilarity does not leverage avdl, so this change would have relatively minor change in the result list.
+Because relative idf values of terms will remain same.
+However, when combined with other factors such as term frequency, relative ranking of documents could change.
+Some Similarity implementations (such as the ones instantiated with NormalizationH2 and BM25) take account into avdl and would have notable change in ranked list.
+Especially if you have a collection of documents with varying lengths.
+Because NormalizationH2 tends to punish documents longer than avdl.
+
 ## Separation of IndexDocument and StoredDocument (LUCENE-3312)

 The API of oal.document was restructured to differentiate between stored 
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
@ -63,9 +63,9 @@ public class BM25Similarity extends Similarity {
    this.b  = 0.75f;
  }
  
-  /** Implemented as <code>log(1 + (numDocs - docFreq + 0.5)/(docFreq + 0.5))</code>. */
-  protected float idf(long docFreq, long numDocs) {
-    return (float) Math.log(1 + (numDocs - docFreq + 0.5D)/(docFreq + 0.5D));
+  /** Implemented as <code>log(1 + (docCount - docFreq + 0.5)/(docFreq + 0.5))</code>. */
+  protected float idf(long docFreq, long docCount) {
+    return (float) Math.log(1 + (docCount - docFreq + 0.5D)/(docFreq + 0.5D));
  }
  
  /** Implemented as <code>1 / (distance + 1)</code>. */
@ -78,7 +78,7 @@ public class BM25Similarity extends Similarity {
    return 1;
  }
  
-  /** The default implementation computes the average as <code>sumTotalTermFreq / maxDoc</code>,
+  /** The default implementation computes the average as <code>sumTotalTermFreq / docCount</code>,
   * or returns <code>1</code> if the index does not store sumTotalTermFreq:
   * any field that omits frequency information). */
  protected float avgFieldLength(CollectionStatistics collectionStats) {
@ -86,7 +86,8 @@ public class BM25Similarity extends Similarity {
    if (sumTotalTermFreq <= 0) {
      return 1f;       // field does not exist, or stat is unsupported
    } else {
-      return (float) (sumTotalTermFreq / (double) collectionStats.maxDoc());
+      final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
+      return (float) (sumTotalTermFreq / (double) docCount);
    }
  }
  
@ -150,14 +151,14 @@ public class BM25Similarity extends Similarity {
   * The default implementation uses:
   * 
   * <pre class="prettyprint">
-   * idf(docFreq, searcher.maxDoc());
+   * idf(docFreq, docCount);
   * </pre>
   * 
-   * Note that {@link CollectionStatistics#maxDoc()} is used instead of
+   * Note that {@link CollectionStatistics#docCount()} is used instead of
   * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also 
   * {@link TermStatistics#docFreq()} is used, and when the latter 
-   * is inaccurate, so is {@link CollectionStatistics#maxDoc()}, and in the same direction.
-   * In addition, {@link CollectionStatistics#maxDoc()} is more efficient to compute
+   * is inaccurate, so is {@link CollectionStatistics#docCount()}, and in the same direction.
+   * In addition, {@link CollectionStatistics#docCount()} does not skew when fields are sparse.
   *   
   * @param collectionStats collection-level statistics
   * @param termStats term-level statistics for the term
@ -166,9 +167,9 @@ public class BM25Similarity extends Similarity {
   */
  public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
    final long df = termStats.docFreq();
-    final long max = collectionStats.maxDoc();
-    final float idf = idf(df, max);
-    return Explanation.match(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")");
+    final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
+    final float idf = idf(df, docCount);
+    return Explanation.match(idf, "idf(docFreq=" + df + ", docCount=" + docCount + ")");
  }

  /**
@ -185,13 +186,13 @@ public class BM25Similarity extends Similarity {
   *         for each term.
   */
  public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
-    final long max = collectionStats.maxDoc();
+    final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
    float idf = 0.0f;
    List<Explanation> details = new ArrayList<>();
    for (final TermStatistics stat : termStats ) {
      final long df = stat.docFreq();
-      final float termIdf = idf(df, max);
-      details.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"));
+      final float termIdf = idf(df, docCount);
+      details.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ")"));
      idf += termIdf;
    }
    return Explanation.match(idf, "idf(), sum of:", details);
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java
@ -133,10 +133,10 @@ public class DefaultSimilarity extends TFIDFSimilarity {
    return 1;
  }

-  /** Implemented as <code>log(numDocs/(docFreq+1)) + 1</code>. */
+  /** Implemented as <code>log(docCount/(docFreq+1)) + 1</code>. */
  @Override
-  public float idf(long docFreq, long numDocs) {
-    return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0);
+  public float idf(long docFreq, long docCount) {
+    return (float)(Math.log(docCount/(double)(docFreq+1)) + 1.0);
  }
    
  /** 
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java
@ -102,7 +102,7 @@ public abstract class SimilarityBase extends Similarity {
  protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
    // #positions(field) must be >= #positions(term)
    assert collectionStats.sumTotalTermFreq() == -1 || collectionStats.sumTotalTermFreq() >= termStats.totalTermFreq();
-    long numberOfDocuments = collectionStats.maxDoc();
+    long numberOfDocuments = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
    
    long docFreq = termStats.docFreq();
    long totalTermFreq = termStats.totalTermFreq();
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
@ -346,7 +346,7 @@ import org.apache.lucene.util.BytesRef;
 *          </td>
 *          <td valign="middle" align="center">
 *            <table summary="inverse document frequency computation">
- *               <tr><td align="center" style="text-align: center"><small>numDocs</small></td></tr>
+ *               <tr><td align="center" style="text-align: center"><small>docCount</small></td></tr>
 *               <tr><td align="center" style="text-align: center">&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;</td></tr>
 *               <tr><td align="center" style="text-align: center"><small>docFreq+1</small></td></tr>
 *            </table>
@ -566,14 +566,14 @@ public abstract class TFIDFSimilarity extends Similarity {
   * The default implementation uses:
   * 
   * <pre class="prettyprint">
-   * idf(docFreq, searcher.maxDoc());
+   * idf(docFreq, docCount);
   * </pre>
   * 
-   * Note that {@link CollectionStatistics#maxDoc()} is used instead of
+   * Note that {@link CollectionStatistics#docCount()} is used instead of
   * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also 
   * {@link TermStatistics#docFreq()} is used, and when the latter 
-   * is inaccurate, so is {@link CollectionStatistics#maxDoc()}, and in the same direction.
-   * In addition, {@link CollectionStatistics#maxDoc()} is more efficient to compute
+   * is inaccurate, so is {@link CollectionStatistics#docCount()}, and in the same direction.
+   * In addition, {@link CollectionStatistics#docCount()} does not skew when fields are sparse.
   *   
   * @param collectionStats collection-level statistics
   * @param termStats term-level statistics for the term
@ -582,9 +582,9 @@ public abstract class TFIDFSimilarity extends Similarity {
   */
  public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
    final long df = termStats.docFreq();
-    final long max = collectionStats.maxDoc();
-    final float idf = idf(df, max);
-    return Explanation.match(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")");
+    final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
+    final float idf = idf(df, docCount);
+    return Explanation.match(idf, "idf(docFreq=" + df + ", docCount=" + docCount + ")");
  }

  /**
@ -601,13 +601,13 @@ public abstract class TFIDFSimilarity extends Similarity {
   *         for each term.
   */
  public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
-    final long max = collectionStats.maxDoc();
+    final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
    float idf = 0.0f;
    List<Explanation> subs = new ArrayList<>();
    for (final TermStatistics stat : termStats ) {
      final long df = stat.docFreq();
-      final float termIdf = idf(df, max);
-      subs.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"));
+      final float termIdf = idf(df, docCount);
+      subs.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ")"));
      idf += termIdf;
    }
    return Explanation.match(idf, "idf(), sum of:", subs);
@ -623,10 +623,10 @@ public abstract class TFIDFSimilarity extends Similarity {
   * and smaller values for common terms.
   *
   * @param docFreq the number of documents which contain the term
-   * @param numDocs the total number of documents in the collection
+   * @param docCount the total number of documents in the collection
   * @return a score factor based on the term's document frequency
   */
-  public abstract float idf(long docFreq, long numDocs);
+  public abstract float idf(long docFreq, long docCount);

  /**
   * Compute an index-time normalization value for this field instance.
--- a/lucene/core/src/test/org/apache/lucene/index/TestMaxTermFrequency.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestMaxTermFrequency.java
@ -118,7 +118,7 @@ public class TestMaxTermFrequency extends LuceneTestCase {
    @Override public float coord(int overlap, int maxOverlap) { return 0; }
    @Override public float queryNorm(float sumOfSquaredWeights) { return 0; }
    @Override public float tf(float freq) { return 0; }
-    @Override public float idf(long docFreq, long numDocs) { return 0; }
+    @Override public float idf(long docFreq, long docCount) { return 0; }
    @Override public float sloppyFreq(int distance) { return 0; }
    @Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 0; }
  }
--- a/lucene/core/src/test/org/apache/lucene/index/TestNorms.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestNorms.java
@ -67,7 +67,7 @@ public class TestNorms extends LuceneTestCase {
    @Override public float coord(int overlap, int maxOverlap) { return 0; }
    @Override public float queryNorm(float sumOfSquaredWeights) { return 0; }
    @Override public float tf(float freq) { return 0; }
-    @Override public float idf(long docFreq, long numDocs) { return 0; }
+    @Override public float idf(long docFreq, long docCount) { return 0; }
    @Override public float sloppyFreq(int distance) { return 0; }
    @Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 0; }
  }
--- a/lucene/core/src/test/org/apache/lucene/index/TestOmitTf.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestOmitTf.java
@ -54,7 +54,7 @@ public class TestOmitTf extends LuceneTestCase {
    @Override public float lengthNorm(FieldInvertState state) { return state.getBoost(); }
    @Override public float tf(float freq) { return freq; }
    @Override public float sloppyFreq(int distance) { return 2.0f; }
-    @Override public float idf(long docFreq, long numDocs) { return 1.0f; }
+    @Override public float idf(long docFreq, long docCount) { return 1.0f; }
    @Override public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] termStats) {
      return Explanation.match(1.0f, "Inexplicable");
    }
--- a/lucene/core/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java
@ -80,7 +80,7 @@ public class TestDisjunctionMaxQuery extends LuceneTestCase {
    }
    
    @Override
-    public float idf(long docFreq, long numDocs) {
+    public float idf(long docFreq, long docCount) {
      return 1.0f;
    }
  }
--- a/lucene/core/src/test/org/apache/lucene/search/TestSimilarity.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestSimilarity.java
@ -46,7 +46,7 @@ public class TestSimilarity extends LuceneTestCase {
    @Override public float lengthNorm(FieldInvertState state) { return state.getBoost(); }
    @Override public float tf(float freq) { return freq; }
    @Override public float sloppyFreq(int distance) { return 2.0f; }
-    @Override public float idf(long docFreq, long numDocs) { return 1.0f; }
+    @Override public float idf(long docFreq, long docCount) { return 1.0f; }
    @Override public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] stats) {
      return Explanation.match(1.0f, "Inexplicable"); 
    }
--- a/lucene/core/src/test/org/apache/lucene/search/TestSimilarityProvider.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestSimilarityProvider.java
@ -142,7 +142,7 @@ public class TestSimilarityProvider extends LuceneTestCase {
    }

    @Override
-    public float idf(long docFreq, long numDocs) {
+    public float idf(long docFreq, long docCount) {
      return 1f;
    }

@ -190,7 +190,7 @@ public class TestSimilarityProvider extends LuceneTestCase {
    }

    @Override
-    public float idf(long docFreq, long numDocs) {
+    public float idf(long docFreq, long docCount) {
      return 10f;
    }

--- a/lucene/core/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java
@ -288,7 +288,7 @@ public class TestPayloadTermQuery extends LuceneTestCase {
    }

    @Override
-    public float idf(long docFreq, long numDocs) {
+    public float idf(long docFreq, long docCount) {
      return 1;
    }

--- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java
@ -38,6 +38,7 @@ import org.apache.lucene.search.spans.SpanOrQuery;
 import org.apache.lucene.search.spans.SpanTermQuery;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;

 /**
 * Tests against all the similarities we have
@ -160,6 +161,48 @@ public class TestSimilarity2 extends LuceneTestCase {
    dir.close();
  }
  
+  /** make sure scores are not skewed by docs not containing the field */
+  public void testNoFieldSkew() throws Exception {
+    Directory dir = newDirectory();
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
+    Document doc = new Document();
+    doc.add(newTextField("foo", "bar baz somethingelse", Field.Store.NO));
+    iw.addDocument(doc);
+    IndexReader ir = iw.getReader();
+    IndexSearcher is = newSearcher(ir);
+    
+    BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
+    queryBuilder.add(new TermQuery(new Term("foo", "bar")), BooleanClause.Occur.SHOULD);
+    queryBuilder.add(new TermQuery(new Term("foo", "baz")), BooleanClause.Occur.SHOULD);
+    Query query = queryBuilder.build();
+    
+    // collect scores
+    List<Float> scores = new ArrayList<>();
+    for (Similarity sim : sims) {
+      is.setSimilarity(sim);
+      scores.add(is.explain(query, 0).getValue());
+    }
+    ir.close();
+    
+    // add some additional docs without the field
+    int numExtraDocs = TestUtil.nextInt(random(), 1, 1000);
+    for (int i = 0; i < numExtraDocs; i++) {
+      iw.addDocument(new Document());
+    }
+    
+    // check scores are the same
+    ir = iw.getReader();
+    is = newSearcher(ir);
+    for (int i = 0; i < sims.size(); i++) {
+      is.setSimilarity(sims.get(i));
+      assertEquals(scores.get(i).floatValue(), is.explain(query, 0).getValue(), 0F);
+    }
+    
+    iw.close();
+    ir.close();
+    dir.close();
+  }
+  
  /** make sure all sims work if TF is omitted */
  public void testOmitTF() throws Exception {
    Directory dir = newDirectory();
--- a/lucene/queries/src/test/org/apache/lucene/queries/function/TestLongNormValueSource.java
+++ b/lucene/queries/src/test/org/apache/lucene/queries/function/TestLongNormValueSource.java
@ -203,10 +203,10 @@ class PreciseDefaultSimilarity extends TFIDFSimilarity {
    return 1;
  }

-  /** Implemented as <code>log(numDocs/(docFreq+1)) + 1</code>. */
+  /** Implemented as <code>log(docCount/(docFreq+1)) + 1</code>. */
  @Override
-  public float idf(long docFreq, long numDocs) {
-    return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0);
+  public float idf(long docFreq, long docCount) {
+    return (float)(Math.log(docCount/(double)(docFreq+1)) + 1.0);
  }

  /**
--- a/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java
+++ b/solr/core/src/test/org/apache/solr/search/TestExtendedDismaxParser.java
@ -566,8 +566,8 @@ public class TestExtendedDismaxParser extends SolrTestCaseJ4 {
  }
  
  public void testAliasingBoost() throws Exception {
-    assertQ(req("defType","edismax", "q","Zapp Pig", "qf","myalias", "f.myalias.qf","name trait_ss^0.5"), "//result/doc[1]/str[@name='id']=42", "//result/doc[2]/str[@name='id']=47");//doc 42 should score higher than 46
-    assertQ(req("defType","edismax", "q","Zapp Pig", "qf","myalias^100 name", "f.myalias.qf","trait_ss^0.5"), "//result/doc[1]/str[@name='id']=47", "//result/doc[2]/str[@name='id']=42");//Now the order should be inverse
+    assertQ(req("defType","edismax", "q","Zapp Pig", "qf","myalias", "f.myalias.qf","name trait_ss^0.1"), "//result/doc[1]/str[@name='id']=42", "//result/doc[2]/str[@name='id']=47");//doc 42 should score higher than 46
+    assertQ(req("defType","edismax", "q","Zapp Pig", "qf","myalias^100 name", "f.myalias.qf","trait_ss^0.1"), "//result/doc[1]/str[@name='id']=47", "//result/doc[2]/str[@name='id']=42");//Now the order should be inverse
  }
  
  public void testCyclicAliasing() throws Exception {