LUCENE-8025: Use totalTermFreq=sumDocFreq when scoring DOCS_ONLY fields

This commit is contained in:
Robert Muir 2017-11-01 19:31:24 -04:00
parent 6eea7f70a0
commit 7495a9d75b
3 changed files with 30 additions and 20 deletions

View File

@ -36,6 +36,11 @@ Bug Fixes
refreshes or commits, and high indexing thread counts, do not
overflow an int (Mykhailo Demianenko via Mike McCandless)
* LUCENE-8025: Use totalTermFreq=sumDocFreq when scoring DOCS_ONLY fields
that omit term frequency information, as it is equivalent in that case.
Previously bogus numbers were used, and many similarities would
completely degrade. (Robert Muir, Adrien Grand)
Optimizations
* LUCENE-8018: Smaller FieldInfos memory footprint by not retaining unnecessary

View File

@ -83,17 +83,21 @@ public class BM25Similarity extends Similarity {
return 1;
}
/** The default implementation computes the average as <code>sumTotalTermFreq / docCount</code>,
* or returns <code>1</code> if the index does not store sumTotalTermFreq
* (i.e. any field that omits frequency information). */
/** The default implementation computes the average as <code>sumTotalTermFreq / docCount</code> */
protected float avgFieldLength(CollectionStatistics collectionStats) {
final long sumTotalTermFreq = collectionStats.sumTotalTermFreq();
if (sumTotalTermFreq <= 0) {
return 1f; // field does not exist, or stat is unsupported
final long sumTotalTermFreq;
if (collectionStats.sumTotalTermFreq() == -1) {
// frequencies are omitted (tf=1), so sumTotalTermFreq is the # of postings
if (collectionStats.sumDocFreq() == -1) {
// theoretical case only (sumDocFreq is also -1): TODO remove this branch
return 1f;
}
sumTotalTermFreq = collectionStats.sumDocFreq();
} else {
final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
return (float) (sumTotalTermFreq / (double) docCount);
sumTotalTermFreq = collectionStats.sumTotalTermFreq();
}
final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
return (float) (sumTotalTermFreq / (double) docCount);
}
/**

View File

@ -107,7 +107,7 @@ public abstract class SimilarityBase extends Similarity {
long docFreq = termStats.docFreq();
long totalTermFreq = termStats.totalTermFreq();
// codec does not supply totalTermFreq: substitute docFreq
// frequencies are omitted, all postings have tf=1, so totalTermFreq = docFreq
if (totalTermFreq == -1) {
totalTermFreq = docFreq;
}
@ -115,18 +115,19 @@ public abstract class SimilarityBase extends Similarity {
final long numberOfFieldTokens;
final double avgFieldLength;
long sumTotalTermFreq = collectionStats.sumTotalTermFreq();
if (sumTotalTermFreq <= 0) {
// field does not exist;
// We have to provide something if the codec doesn't supply these measures,
// or if someone omitted frequencies for the field... negative values cause
// NaN/Inf for some scorers.
numberOfFieldTokens = docFreq;
avgFieldLength = 1;
if (collectionStats.sumTotalTermFreq() == -1) {
// frequencies are omitted, so sumTotalTermFreq = # postings
if (collectionStats.sumDocFreq() == -1) {
// theoretical case only (sumDocFreq is also -1): TODO remove this branch
numberOfFieldTokens = docFreq;
avgFieldLength = 1f;
} else {
numberOfFieldTokens = collectionStats.sumDocFreq();
avgFieldLength = (float) (collectionStats.sumDocFreq() / (double)numberOfDocuments);
}
} else {
numberOfFieldTokens = sumTotalTermFreq;
avgFieldLength = (float)numberOfFieldTokens / numberOfDocuments;
numberOfFieldTokens = collectionStats.sumTotalTermFreq();
avgFieldLength = (float) (collectionStats.sumTotalTermFreq() / (double)numberOfDocuments);
}
// TODO: add sumDocFreq for field (numberOfFieldPostings)