LUCENE-7524: More detailed explanation of idf.

This commit is contained in:
Adrien Grand 2016-11-10 14:08:26 +01:00
parent 74da1cff27
commit 16967ffdbb
5 changed files with 37 additions and 21 deletions

View File

@ -29,6 +29,9 @@ Improvements
now throws IllegalArgumentException instead of a cryptic exception
that closes your IndexWriter (Steve Chen via Mike McCandless)
* LUCENE-7524: Added more detailed explanation of how IDF is computed in
ClassicSimilarity and BM25Similarity. (Adrien Grand)
======================= Lucene 6.3.0 =======================
API Changes

View File

@ -175,7 +175,9 @@ public class BM25Similarity extends Similarity {
final long df = termStats.docFreq();
final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
final float idf = idf(df, docCount);
return Explanation.match(idf, "idf(docFreq=" + df + ", docCount=" + docCount + ")");
return Explanation.match(idf, "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
Explanation.match(df, "docFreq"),
Explanation.match(docCount, "docCount"));
}
/**
@ -192,16 +194,14 @@ public class BM25Similarity extends Similarity {
* for each term.
*/
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
float idf = 0.0f;
double idf = 0d; // sum into a double before casting into a float
List<Explanation> details = new ArrayList<>();
for (final TermStatistics stat : termStats ) {
final long df = stat.docFreq();
final float termIdf = idf(df, docCount);
details.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ")"));
idf += termIdf;
Explanation idfExplain = idfExplain(collectionStats, stat);
details.add(idfExplain);
idf += idfExplain.getValue();
}
return Explanation.match(idf, "idf(), sum of:", details);
return Explanation.match((float) idf, "idf(), sum of:", details);
}
@Override
@ -305,7 +305,7 @@ public class BM25Similarity extends Similarity {
subs.add(Explanation.match(0, "parameter b (norms omitted for field)"));
return Explanation.match(
(freq.getValue() * (k1 + 1)) / (freq.getValue() + k1),
"tfNorm, computed from:", subs);
"tfNorm, computed as (freq * (k1 + 1)) / (freq + k1) from:", subs);
} else {
float doclen = decodeNormValue((byte)norms.get(doc));
subs.add(Explanation.match(b, "parameter b"));
@ -313,7 +313,7 @@ public class BM25Similarity extends Similarity {
subs.add(Explanation.match(doclen, "fieldLength"));
return Explanation.match(
(freq.getValue() * (k1 + 1)) / (freq.getValue() + k1 * (1 - b + b * doclen/stats.avgdl)),
"tfNorm, computed from:", subs);
"tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", subs);
}
}

View File

@ -18,6 +18,9 @@ package org.apache.lucene.search.similarities;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SmallFloat;
@ -133,6 +136,16 @@ public class ClassicSimilarity extends TFIDFSimilarity {
return 1;
}
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
final long df = termStats.docFreq();
final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
final float idf = idf(df, docCount);
return Explanation.match(idf, "idf, computed as log((docCount+1)/(docFreq+1)) + 1 from:",
Explanation.match(df, "docFreq"),
Explanation.match(docCount, "docCount"));
}
/** Implemented as <code>log((docCount+1)/(docFreq+1)) + 1</code>. */
@Override
public float idf(long docFreq, long docCount) {

View File

@ -601,16 +601,14 @@ public abstract class TFIDFSimilarity extends Similarity {
* for each term.
*/
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
float idf = 0.0f;
double idf = 0d; // sum into a double before casting into a float
List<Explanation> subs = new ArrayList<>();
for (final TermStatistics stat : termStats ) {
final long df = stat.docFreq();
final float termIdf = idf(df, docCount);
subs.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ")"));
idf += termIdf;
Explanation idfExplain = idfExplain(collectionStats, stat);
subs.add(idfExplain);
idf += idfExplain.getValue();
}
return Explanation.match(idf, "idf(), sum of:", subs);
return Explanation.match((float) idf, "idf(), sum of:", subs);
}
/** Computes a score factor based on a term's document frequency (the number

View File

@ -368,8 +368,9 @@ public class CheckHits {
boolean productOf = descr.endsWith("product of:");
boolean sumOf = descr.endsWith("sum of:");
boolean maxOf = descr.endsWith("max of:");
boolean computedOf = descr.matches(".*, computed as .* from:");
boolean maxTimesOthers = false;
if (!(productOf || sumOf || maxOf)) {
if (!(productOf || sumOf || maxOf || computedOf)) {
// maybe 'max plus x times others'
int k1 = descr.indexOf("max plus ");
if (k1>=0) {
@ -387,9 +388,9 @@ public class CheckHits {
// TODO: this is a TERRIBLE assertion!!!!
Assert.assertTrue(
q+": multi valued explanation description=\""+descr
+"\" must be 'max of plus x times others' or end with 'product of'"
+"\" must be 'max of plus x times others', 'computed as x from:' or end with 'product of'"
+" or 'sum of:' or 'max of:' - "+expl,
productOf || sumOf || maxOf || maxTimesOthers);
productOf || sumOf || maxOf || computedOf || maxTimesOthers);
float sum = 0;
float product = 1;
float max = 0;
@ -410,7 +411,8 @@ public class CheckHits {
} else if (maxTimesOthers) {
combined = max + x * (sum - max);
} else {
Assert.assertTrue("should never get here!",false);
Assert.assertTrue("should never get here!", computedOf);
combined = value;
}
Assert.assertEquals(q+": actual subDetails combined=="+combined+
" != value="+value+" Explanation: "+expl,