From 7495a9d75bb2efde2f76d68b376560ab86693cd9 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Wed, 1 Nov 2017 19:31:24 -0400 Subject: [PATCH] LUCENE-8025: Use totalTermFreq=sumDocFreq when scoring DOCS_ONLY fields --- lucene/CHANGES.txt | 5 ++++ .../search/similarities/BM25Similarity.java | 20 +++++++++------ .../search/similarities/SimilarityBase.java | 25 ++++++++++--------- 3 files changed, 30 insertions(+), 20 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index d0ed234bef0..fc1859fa5a2 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -36,6 +36,11 @@ Bug Fixes refreshes or commits, and high indexing thread counts, do not overflow an int (Mykhailo Demianenko via Mike McCandless) +* LUCENE-8025: Use totalTermFreq=sumDocFreq when scoring DOCS_ONLY fields + that omit term frequency information, as it is equivalent in that case. + Previously bogus numbers were used, and many similarities would + completely degrade. (Robert Muir, Adrien Grand) + Optimizations * LUCENE-8018: Smaller FieldInfos memory footprint by not retaining unnecessary diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java index e56932bcb81..47561e43ceb 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java @@ -83,17 +83,21 @@ public class BM25Similarity extends Similarity { return 1; } - /** The default implementation computes the average as sumTotalTermFreq / docCount, - * or returns 1 if the index does not store sumTotalTermFreq: - * any field that omits frequency information). 
*/ + /** The default implementation computes the average as sumTotalTermFreq / docCount */ protected float avgFieldLength(CollectionStatistics collectionStats) { - final long sumTotalTermFreq = collectionStats.sumTotalTermFreq(); - if (sumTotalTermFreq <= 0) { - return 1f; // field does not exist, or stat is unsupported + final long sumTotalTermFreq; + if (collectionStats.sumTotalTermFreq() == -1) { + // frequencies are omitted (tf=1), so it is the # of postings + if (collectionStats.sumDocFreq() == -1) { + // theoretical case only: remove! + return 1f; + } + sumTotalTermFreq = collectionStats.sumDocFreq(); } else { - final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount(); - return (float) (sumTotalTermFreq / (double) docCount); + sumTotalTermFreq = collectionStats.sumTotalTermFreq(); } + final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount(); + return (float) (sumTotalTermFreq / (double) docCount); } /** diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java b/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java index 68f8e5d0109..babef8f5e03 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java @@ -107,7 +107,7 @@ public abstract class SimilarityBase extends Similarity { long docFreq = termStats.docFreq(); long totalTermFreq = termStats.totalTermFreq(); - // codec does not supply totalTermFreq: substitute docFreq + // frequencies are omitted, all postings have tf=1, so totalTermFreq = docFreq if (totalTermFreq == -1) { totalTermFreq = docFreq; } @@ -115,18 +115,19 @@ public abstract class SimilarityBase extends Similarity { final long numberOfFieldTokens; final double avgFieldLength; - long sumTotalTermFreq = collectionStats.sumTotalTermFreq(); - - if (sumTotalTermFreq <= 0) { - 
// field does not exist; - // We have to provide something if codec doesnt supply these measures, - // or if someone omitted frequencies for the field... negative values cause - // NaN/Inf for some scorers. - numberOfFieldTokens = docFreq; - avgFieldLength = 1; + if (collectionStats.sumTotalTermFreq() == -1) { + // frequencies are omitted, so sumTotalTermFreq = # postings + if (collectionStats.sumDocFreq() == -1) { + // theoretical case only: remove! + numberOfFieldTokens = docFreq; + avgFieldLength = 1f; + } else { + numberOfFieldTokens = collectionStats.sumDocFreq(); + avgFieldLength = (float) (collectionStats.sumDocFreq() / (double)numberOfDocuments); + } } else { - numberOfFieldTokens = sumTotalTermFreq; - avgFieldLength = (float)numberOfFieldTokens / numberOfDocuments; + numberOfFieldTokens = collectionStats.sumTotalTermFreq(); + avgFieldLength = (float) (collectionStats.sumTotalTermFreq() / (double)numberOfDocuments); } // TODO: add sumDocFreq for field (numberOfFieldPostings)