mirror of https://github.com/apache/lucene.git
LUCENE-8025: Use totalTermFreq=sumDocFreq when scoring DOCS_ONLY fields
This commit is contained in:
parent
6eea7f70a0
commit
7495a9d75b
|
@@ -36,6 +36,11 @@ Bug Fixes
|
||||||
refreshes or commits, and high indexing thread counts, do not
|
refreshes or commits, and high indexing thread counts, do not
|
||||||
overflow an int (Mykhailo Demianenko via Mike McCandless)
|
overflow an int (Mykhailo Demianenko via Mike McCandless)
|
||||||
|
|
||||||
|
* LUCENE-8025: Use totalTermFreq=sumDocFreq when scoring DOCS_ONLY fields
|
||||||
|
that omit term frequency information, as it is equivalent in that case.
|
||||||
|
Previously bogus numbers were used, and many similarities would
|
||||||
|
completely degrade. (Robert Muir, Adrien Grand)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
|
|
||||||
* LUCENE-8018: Smaller FieldInfos memory footprint by not retaining unnecessary
|
* LUCENE-8018: Smaller FieldInfos memory footprint by not retaining unnecessary
|
||||||
|
|
|
@@ -83,17 +83,21 @@ public class BM25Similarity extends Similarity {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** The default implementation computes the average as <code>sumTotalTermFreq / docCount</code>,
|
/** The default implementation computes the average as <code>sumTotalTermFreq / docCount</code> */
|
||||||
* or returns <code>1</code> if the index does not store sumTotalTermFreq:
|
|
||||||
* any field that omits frequency information. */
|
|
||||||
protected float avgFieldLength(CollectionStatistics collectionStats) {
|
protected float avgFieldLength(CollectionStatistics collectionStats) {
|
||||||
final long sumTotalTermFreq = collectionStats.sumTotalTermFreq();
|
final long sumTotalTermFreq;
|
||||||
if (sumTotalTermFreq <= 0) {
|
if (collectionStats.sumTotalTermFreq() == -1) {
|
||||||
return 1f; // field does not exist, or stat is unsupported
|
// frequencies are omitted (tf=1), so sumTotalTermFreq is the # of postings
|
||||||
|
if (collectionStats.sumDocFreq() == -1) {
|
||||||
|
// theoretical case only: remove!
|
||||||
|
return 1f;
|
||||||
|
}
|
||||||
|
sumTotalTermFreq = collectionStats.sumDocFreq();
|
||||||
} else {
|
} else {
|
||||||
final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
|
sumTotalTermFreq = collectionStats.sumTotalTermFreq();
|
||||||
return (float) (sumTotalTermFreq / (double) docCount);
|
|
||||||
}
|
}
|
||||||
|
final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
|
||||||
|
return (float) (sumTotalTermFreq / (double) docCount);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@@ -107,7 +107,7 @@ public abstract class SimilarityBase extends Similarity {
|
||||||
long docFreq = termStats.docFreq();
|
long docFreq = termStats.docFreq();
|
||||||
long totalTermFreq = termStats.totalTermFreq();
|
long totalTermFreq = termStats.totalTermFreq();
|
||||||
|
|
||||||
// codec does not supply totalTermFreq: substitute docFreq
|
// frequencies are omitted, all postings have tf=1, so totalTermFreq = docFreq
|
||||||
if (totalTermFreq == -1) {
|
if (totalTermFreq == -1) {
|
||||||
totalTermFreq = docFreq;
|
totalTermFreq = docFreq;
|
||||||
}
|
}
|
||||||
|
@@ -115,18 +115,19 @@ public abstract class SimilarityBase extends Similarity {
|
||||||
final long numberOfFieldTokens;
|
final long numberOfFieldTokens;
|
||||||
final double avgFieldLength;
|
final double avgFieldLength;
|
||||||
|
|
||||||
long sumTotalTermFreq = collectionStats.sumTotalTermFreq();
|
if (collectionStats.sumTotalTermFreq() == -1) {
|
||||||
|
// frequencies are omitted, so sumTotalTermFreq = # postings
|
||||||
if (sumTotalTermFreq <= 0) {
|
if (collectionStats.sumDocFreq() == -1) {
|
||||||
// field does not exist;
|
// theoretical case only: remove!
|
||||||
// We have to provide something if codec doesn't supply these measures,
|
numberOfFieldTokens = docFreq;
|
||||||
// or if someone omitted frequencies for the field... negative values cause
|
avgFieldLength = 1f;
|
||||||
// NaN/Inf for some scorers.
|
} else {
|
||||||
numberOfFieldTokens = docFreq;
|
numberOfFieldTokens = collectionStats.sumDocFreq();
|
||||||
avgFieldLength = 1;
|
avgFieldLength = (float) (collectionStats.sumDocFreq() / (double)numberOfDocuments);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
numberOfFieldTokens = sumTotalTermFreq;
|
numberOfFieldTokens = collectionStats.sumTotalTermFreq();
|
||||||
avgFieldLength = (float)numberOfFieldTokens / numberOfDocuments;
|
avgFieldLength = (float) (collectionStats.sumTotalTermFreq() / (double)numberOfDocuments);
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: add sumDocFreq for field (numberOfFieldPostings)
|
// TODO: add sumDocFreq for field (numberOfFieldPostings)
|
||||||
|
|
Loading…
Reference in New Issue