LUCENE-8025: Use totalTermFreq=sumDocFreq when scoring DOCS_ONLY fields

This commit is contained in:
Robert Muir 2017-11-01 19:31:24 -04:00
parent 6eea7f70a0
commit 7495a9d75b
3 changed files with 30 additions and 20 deletions

View File

@ -36,6 +36,11 @@ Bug Fixes
refreshes or commits, and high indexing thread counts, do not refreshes or commits, and high indexing thread counts, do not
overflow an int (Mykhailo Demianenko via Mike McCandless) overflow an int (Mykhailo Demianenko via Mike McCandless)
* LUCENE-8025: Use sumTotalTermFreq=sumDocFreq when scoring DOCS_ONLY fields
that omit term frequency information, as it is equivalent in that case.
Previously bogus numbers were used, and many similarities would
completely degrade. (Robert Muir, Adrien Grand)
Optimizations Optimizations
* LUCENE-8018: Smaller FieldInfos memory footprint by not retaining unnecessary * LUCENE-8018: Smaller FieldInfos memory footprint by not retaining unnecessary

View File

@ -83,17 +83,21 @@ public class BM25Similarity extends Similarity {
return 1; return 1;
} }
/** The default implementation computes the average as <code>sumTotalTermFreq / docCount</code>, /** The default implementation computes the average as <code>sumTotalTermFreq / docCount</code> */
* or returns <code>1</code> if the index does not store sumTotalTermFreq:
* any field that omits frequency information. */
protected float avgFieldLength(CollectionStatistics collectionStats) { protected float avgFieldLength(CollectionStatistics collectionStats) {
final long sumTotalTermFreq = collectionStats.sumTotalTermFreq(); final long sumTotalTermFreq;
if (sumTotalTermFreq <= 0) { if (collectionStats.sumTotalTermFreq() == -1) {
return 1f; // field does not exist, or stat is unsupported // frequencies are omitted (tf=1), so it's the # of postings
if (collectionStats.sumDocFreq() == -1) {
// theoretical case only: remove!
return 1f;
}
sumTotalTermFreq = collectionStats.sumDocFreq();
} else { } else {
final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount(); sumTotalTermFreq = collectionStats.sumTotalTermFreq();
return (float) (sumTotalTermFreq / (double) docCount);
} }
final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
return (float) (sumTotalTermFreq / (double) docCount);
} }
/** /**

View File

@ -107,7 +107,7 @@ public abstract class SimilarityBase extends Similarity {
long docFreq = termStats.docFreq(); long docFreq = termStats.docFreq();
long totalTermFreq = termStats.totalTermFreq(); long totalTermFreq = termStats.totalTermFreq();
// codec does not supply totalTermFreq: substitute docFreq // frequencies are omitted, all postings have tf=1, so totalTermFreq = docFreq
if (totalTermFreq == -1) { if (totalTermFreq == -1) {
totalTermFreq = docFreq; totalTermFreq = docFreq;
} }
@ -115,18 +115,19 @@ public abstract class SimilarityBase extends Similarity {
final long numberOfFieldTokens; final long numberOfFieldTokens;
final double avgFieldLength; final double avgFieldLength;
long sumTotalTermFreq = collectionStats.sumTotalTermFreq(); if (collectionStats.sumTotalTermFreq() == -1) {
// frequencies are omitted, so sumTotalTermFreq = # postings
if (sumTotalTermFreq <= 0) { if (collectionStats.sumDocFreq() == -1) {
// field does not exist; // theoretical case only: remove!
// We have to provide something if codec doesn't supply these measures, numberOfFieldTokens = docFreq;
// or if someone omitted frequencies for the field... negative values cause avgFieldLength = 1f;
// NaN/Inf for some scorers. } else {
numberOfFieldTokens = docFreq; numberOfFieldTokens = collectionStats.sumDocFreq();
avgFieldLength = 1; avgFieldLength = (float) (collectionStats.sumDocFreq() / (double)numberOfDocuments);
}
} else { } else {
numberOfFieldTokens = sumTotalTermFreq; numberOfFieldTokens = collectionStats.sumTotalTermFreq();
avgFieldLength = (float)numberOfFieldTokens / numberOfDocuments; avgFieldLength = (float) (collectionStats.sumTotalTermFreq() / (double)numberOfDocuments);
} }
// TODO: add sumDocFreq for field (numberOfFieldPostings) // TODO: add sumDocFreq for field (numberOfFieldPostings)