mirror of https://github.com/apache/lucene.git
LUCENE-8025: Use totalTermFreq=sumDocFreq when scoring DOCS_ONLY fields
This commit is contained in:
parent
6eea7f70a0
commit
7495a9d75b
|
@ -36,6 +36,11 @@ Bug Fixes
|
|||
refreshes or commits, and high indexing thread counts, do not
|
||||
overflow an int (Mykhailo Demianenko via Mike McCandless)
|
||||
|
||||
* LUCENE-8025: Use sumTotalTermFreq=sumDocFreq when scoring DOCS_ONLY fields
|
||||
that omit term frequency information, as it is equivalent in that case.
|
||||
Previously bogus numbers were used, and many similarities would
|
||||
completely degrade. (Robert Muir, Adrien Grand)
|
||||
|
||||
Optimizations
|
||||
|
||||
* LUCENE-8018: Smaller FieldInfos memory footprint by not retaining unnecessary
|
||||
|
|
|
@ -83,17 +83,21 @@ public class BM25Similarity extends Similarity {
|
|||
return 1;
|
||||
}
|
||||
|
||||
/** The default implementation computes the average as <code>sumTotalTermFreq / docCount</code>,
|
||||
* or returns <code>1</code> if the index does not store sumTotalTermFreq (i.e. for
|
||||
* any field that omits frequency information). */
|
||||
/** The default implementation computes the average as <code>sumTotalTermFreq / docCount</code> */
|
||||
protected float avgFieldLength(CollectionStatistics collectionStats) {
|
||||
final long sumTotalTermFreq = collectionStats.sumTotalTermFreq();
|
||||
if (sumTotalTermFreq <= 0) {
|
||||
return 1f; // field does not exist, or stat is unsupported
|
||||
final long sumTotalTermFreq;
|
||||
if (collectionStats.sumTotalTermFreq() == -1) {
|
||||
// frequencies are omitted (tf=1), so sumTotalTermFreq is the # of postings
|
||||
if (collectionStats.sumDocFreq() == -1) {
|
||||
// theoretical case only: remove!
|
||||
return 1f;
|
||||
}
|
||||
sumTotalTermFreq = collectionStats.sumDocFreq();
|
||||
} else {
|
||||
final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
|
||||
return (float) (sumTotalTermFreq / (double) docCount);
|
||||
sumTotalTermFreq = collectionStats.sumTotalTermFreq();
|
||||
}
|
||||
final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
|
||||
return (float) (sumTotalTermFreq / (double) docCount);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -107,7 +107,7 @@ public abstract class SimilarityBase extends Similarity {
|
|||
long docFreq = termStats.docFreq();
|
||||
long totalTermFreq = termStats.totalTermFreq();
|
||||
|
||||
// codec does not supply totalTermFreq: substitute docFreq
|
||||
// frequencies are omitted, all postings have tf=1, so totalTermFreq = docFreq
|
||||
if (totalTermFreq == -1) {
|
||||
totalTermFreq = docFreq;
|
||||
}
|
||||
|
@ -115,18 +115,19 @@ public abstract class SimilarityBase extends Similarity {
|
|||
final long numberOfFieldTokens;
|
||||
final double avgFieldLength;
|
||||
|
||||
long sumTotalTermFreq = collectionStats.sumTotalTermFreq();
|
||||
|
||||
if (sumTotalTermFreq <= 0) {
|
||||
// field does not exist;
|
||||
// We have to provide something if codec doesn't supply these measures,
|
||||
// or if someone omitted frequencies for the field... negative values cause
|
||||
// NaN/Inf for some scorers.
|
||||
numberOfFieldTokens = docFreq;
|
||||
avgFieldLength = 1;
|
||||
if (collectionStats.sumTotalTermFreq() == -1) {
|
||||
// frequencies are omitted, so sumTotalTermFreq = # postings
|
||||
if (collectionStats.sumDocFreq() == -1) {
|
||||
// theoretical case only: remove!
|
||||
numberOfFieldTokens = docFreq;
|
||||
avgFieldLength = 1f;
|
||||
} else {
|
||||
numberOfFieldTokens = collectionStats.sumDocFreq();
|
||||
avgFieldLength = (float) (collectionStats.sumDocFreq() / (double)numberOfDocuments);
|
||||
}
|
||||
} else {
|
||||
numberOfFieldTokens = sumTotalTermFreq;
|
||||
avgFieldLength = (float)numberOfFieldTokens / numberOfDocuments;
|
||||
numberOfFieldTokens = collectionStats.sumTotalTermFreq();
|
||||
avgFieldLength = (float) (collectionStats.sumTotalTermFreq() / (double)numberOfDocuments);
|
||||
}
|
||||
|
||||
// TODO: add sumDocFreq for field (numberOfFieldPostings)
|
||||
|
|
Loading…
Reference in New Issue