mirror of https://github.com/apache/lucene.git
LUCENE-7524: More detailed explanation of idf.
This commit is contained in:
parent
74da1cff27
commit
16967ffdbb
|
@ -29,6 +29,9 @@ Improvements
|
|||
now throws IllegalArgumentException instead of a cryptic exception
|
||||
that closes your IndexWriter (Steve Chen via Mike McCandless)
|
||||
|
||||
* LUCENE-7524: Added more detailed explanation of how IDF is computed in
|
||||
ClassicSimilarity and BM25Similarity. (Adrien Grand)
|
||||
|
||||
======================= Lucene 6.3.0 =======================
|
||||
|
||||
API Changes
|
||||
|
|
|
@ -175,7 +175,9 @@ public class BM25Similarity extends Similarity {
|
|||
final long df = termStats.docFreq();
|
||||
final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
|
||||
final float idf = idf(df, docCount);
|
||||
return Explanation.match(idf, "idf(docFreq=" + df + ", docCount=" + docCount + ")");
|
||||
return Explanation.match(idf, "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
|
||||
Explanation.match(df, "docFreq"),
|
||||
Explanation.match(docCount, "docCount"));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -192,16 +194,14 @@ public class BM25Similarity extends Similarity {
|
|||
* for each term.
|
||||
*/
|
||||
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
|
||||
final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
|
||||
float idf = 0.0f;
|
||||
double idf = 0d; // sum into a double before casting into a float
|
||||
List<Explanation> details = new ArrayList<>();
|
||||
for (final TermStatistics stat : termStats ) {
|
||||
final long df = stat.docFreq();
|
||||
final float termIdf = idf(df, docCount);
|
||||
details.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ")"));
|
||||
idf += termIdf;
|
||||
Explanation idfExplain = idfExplain(collectionStats, stat);
|
||||
details.add(idfExplain);
|
||||
idf += idfExplain.getValue();
|
||||
}
|
||||
return Explanation.match(idf, "idf(), sum of:", details);
|
||||
return Explanation.match((float) idf, "idf(), sum of:", details);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -305,7 +305,7 @@ public class BM25Similarity extends Similarity {
|
|||
subs.add(Explanation.match(0, "parameter b (norms omitted for field)"));
|
||||
return Explanation.match(
|
||||
(freq.getValue() * (k1 + 1)) / (freq.getValue() + k1),
|
||||
"tfNorm, computed from:", subs);
|
||||
"tfNorm, computed as (freq * (k1 + 1)) / (freq + k1) from:", subs);
|
||||
} else {
|
||||
float doclen = decodeNormValue((byte)norms.get(doc));
|
||||
subs.add(Explanation.match(b, "parameter b"));
|
||||
|
@ -313,7 +313,7 @@ public class BM25Similarity extends Similarity {
|
|||
subs.add(Explanation.match(doclen, "fieldLength"));
|
||||
return Explanation.match(
|
||||
(freq.getValue() * (k1 + 1)) / (freq.getValue() + k1 * (1 - b + b * doclen/stats.avgdl)),
|
||||
"tfNorm, computed from:", subs);
|
||||
"tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", subs);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -18,6 +18,9 @@ package org.apache.lucene.search.similarities;
|
|||
|
||||
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
import org.apache.lucene.search.CollectionStatistics;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.TermStatistics;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.SmallFloat;
|
||||
|
||||
|
@ -133,6 +136,16 @@ public class ClassicSimilarity extends TFIDFSimilarity {
|
|||
return 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
|
||||
final long df = termStats.docFreq();
|
||||
final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
|
||||
final float idf = idf(df, docCount);
|
||||
return Explanation.match(idf, "idf, computed as log((docCount+1)/(docFreq+1)) + 1 from:",
|
||||
Explanation.match(df, "docFreq"),
|
||||
Explanation.match(docCount, "docCount"));
|
||||
}
|
||||
|
||||
/** Implemented as <code>log((docCount+1)/(docFreq+1)) + 1</code>. */
|
||||
@Override
|
||||
public float idf(long docFreq, long docCount) {
|
||||
|
|
|
@ -601,16 +601,14 @@ public abstract class TFIDFSimilarity extends Similarity {
|
|||
* for each term.
|
||||
*/
|
||||
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
|
||||
final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
|
||||
float idf = 0.0f;
|
||||
double idf = 0d; // sum into a double before casting into a float
|
||||
List<Explanation> subs = new ArrayList<>();
|
||||
for (final TermStatistics stat : termStats ) {
|
||||
final long df = stat.docFreq();
|
||||
final float termIdf = idf(df, docCount);
|
||||
subs.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ")"));
|
||||
idf += termIdf;
|
||||
Explanation idfExplain = idfExplain(collectionStats, stat);
|
||||
subs.add(idfExplain);
|
||||
idf += idfExplain.getValue();
|
||||
}
|
||||
return Explanation.match(idf, "idf(), sum of:", subs);
|
||||
return Explanation.match((float) idf, "idf(), sum of:", subs);
|
||||
}
|
||||
|
||||
/** Computes a score factor based on a term's document frequency (the number
|
||||
|
|
|
@ -368,8 +368,9 @@ public class CheckHits {
|
|||
boolean productOf = descr.endsWith("product of:");
|
||||
boolean sumOf = descr.endsWith("sum of:");
|
||||
boolean maxOf = descr.endsWith("max of:");
|
||||
boolean computedOf = descr.matches(".*, computed as .* from:");
|
||||
boolean maxTimesOthers = false;
|
||||
if (!(productOf || sumOf || maxOf)) {
|
||||
if (!(productOf || sumOf || maxOf || computedOf)) {
|
||||
// maybe 'max plus x times others'
|
||||
int k1 = descr.indexOf("max plus ");
|
||||
if (k1>=0) {
|
||||
|
@ -387,9 +388,9 @@ public class CheckHits {
|
|||
// TODO: this is a TERRIBLE assertion!!!!
|
||||
Assert.assertTrue(
|
||||
q+": multi valued explanation description=\""+descr
|
||||
+"\" must be 'max of plus x times others' or end with 'product of'"
|
||||
+"\" must be 'max of plus x times others', 'computed as x from:' or end with 'product of'"
|
||||
+" or 'sum of:' or 'max of:' - "+expl,
|
||||
productOf || sumOf || maxOf || maxTimesOthers);
|
||||
productOf || sumOf || maxOf || computedOf || maxTimesOthers);
|
||||
float sum = 0;
|
||||
float product = 1;
|
||||
float max = 0;
|
||||
|
@ -410,7 +411,8 @@ public class CheckHits {
|
|||
} else if (maxTimesOthers) {
|
||||
combined = max + x * (sum - max);
|
||||
} else {
|
||||
Assert.assertTrue("should never get here!",false);
|
||||
Assert.assertTrue("should never get here!", computedOf);
|
||||
combined = value;
|
||||
}
|
||||
Assert.assertEquals(q+": actual subDetails combined=="+combined+
|
||||
" != value="+value+" Explanation: "+expl,
|
||||
|
|
Loading…
Reference in New Issue