diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 9979eb9d7b7..3edf6f9338f 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -29,6 +29,9 @@ Improvements now throws IllegalArgumentException instead of a cryptic exception that closes your IndexWriter (Steve Chen via Mike McCandless) +* LUCENE-7524: Added more detailed explanation of how IDF is computed in + ClassicSimilarity and BM25Similarity. (Adrien Grand) + ======================= Lucene 6.3.0 ======================= API Changes diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java index 99f76ef9956..11f4eded87e 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java @@ -175,7 +175,9 @@ public class BM25Similarity extends Similarity { final long df = termStats.docFreq(); final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount(); final float idf = idf(df, docCount); - return Explanation.match(idf, "idf(docFreq=" + df + ", docCount=" + docCount + ")"); + return Explanation.match(idf, "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", + Explanation.match(df, "docFreq"), + Explanation.match(docCount, "docCount")); } /** @@ -192,16 +194,14 @@ public class BM25Similarity extends Similarity { * for each term. */ public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) { - final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount(); - float idf = 0.0f; + double idf = 0d; // sum into a double before casting into a float List details = new ArrayList<>(); for (final TermStatistics stat : termStats ) { - final long df = stat.docFreq(); - final float termIdf = idf(df, docCount); - details.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ")")); - idf += termIdf; + Explanation idfExplain = idfExplain(collectionStats, stat); + details.add(idfExplain); + idf += idfExplain.getValue(); } - return Explanation.match(idf, "idf(), sum of:", details); + return Explanation.match((float) idf, "idf(), sum of:", details); } @Override @@ -305,7 +305,7 @@ public class BM25Similarity extends Similarity { subs.add(Explanation.match(0, "parameter b (norms omitted for field)")); return Explanation.match( (freq.getValue() * (k1 + 1)) / (freq.getValue() + k1), - "tfNorm, computed from:", subs); + "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1) from:", subs); } else { float doclen = decodeNormValue((byte)norms.get(doc)); subs.add(Explanation.match(b, "parameter b")); @@ -313,7 +313,7 @@ public class BM25Similarity extends Similarity { subs.add(Explanation.match(doclen, "fieldLength")); return Explanation.match( (freq.getValue() * (k1 + 1)) / (freq.getValue() + k1 * (1 - b + b * doclen/stats.avgdl)), - "tfNorm, computed from:", subs); + "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", subs); } } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java index 3b6cbddbe14..5f5b6425b8e 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java @@ -18,6 +18,9 @@ package org.apache.lucene.search.similarities; import org.apache.lucene.index.FieldInvertState; +import org.apache.lucene.search.CollectionStatistics; +import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.TermStatistics; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.SmallFloat; @@ -133,6 +136,16 @@ public class ClassicSimilarity extends TFIDFSimilarity { return 1; } + @Override + public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) { + final long df = termStats.docFreq(); + final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount(); + final float idf = idf(df, docCount); + return Explanation.match(idf, "idf, computed as log((docCount+1)/(docFreq+1)) + 1 from:", + Explanation.match(df, "docFreq"), + Explanation.match(docCount, "docCount")); + } + /** Implemented as log((docCount+1)/(docFreq+1)) + 1. */ @Override public float idf(long docFreq, long docCount) { diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java index a6b1ee5168c..774c940b231 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java @@ -601,16 +601,14 @@ public abstract class TFIDFSimilarity extends Similarity { * for each term. */ public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) { - final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount(); - float idf = 0.0f; + double idf = 0d; // sum into a double before casting into a float List subs = new ArrayList<>(); for (final TermStatistics stat : termStats ) { - final long df = stat.docFreq(); - final float termIdf = idf(df, docCount); - subs.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ")")); - idf += termIdf; + Explanation idfExplain = idfExplain(collectionStats, stat); + subs.add(idfExplain); + idf += idfExplain.getValue(); } - return Explanation.match(idf, "idf(), sum of:", subs); + return Explanation.match((float) idf, "idf(), sum of:", subs); } /** Computes a score factor based on a term's document frequency (the number diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/CheckHits.java b/lucene/test-framework/src/java/org/apache/lucene/search/CheckHits.java index 9a14b5daa73..dee7d8405c0 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/search/CheckHits.java +++ b/lucene/test-framework/src/java/org/apache/lucene/search/CheckHits.java @@ -368,8 +368,9 @@ public class CheckHits { boolean productOf = descr.endsWith("product of:"); boolean sumOf = descr.endsWith("sum of:"); boolean maxOf = descr.endsWith("max of:"); + boolean computedOf = descr.matches(".*, computed as .* from:"); boolean maxTimesOthers = false; - if (!(productOf || sumOf || maxOf)) { + if (!(productOf || sumOf || maxOf || computedOf)) { // maybe 'max plus x times others' int k1 = descr.indexOf("max plus "); if (k1>=0) { @@ -387,9 +388,9 @@ public class CheckHits { // TODO: this is a TERRIBLE assertion!!!! Assert.assertTrue( q+": multi valued explanation description=\""+descr - +"\" must be 'max of plus x times others' or end with 'product of'" + +"\" must be 'max of plus x times others', 'computed as x from:' or end with 'product of'" +" or 'sum of:' or 'max of:' - "+expl, - productOf || sumOf || maxOf || maxTimesOthers); + productOf || sumOf || maxOf || computedOf || maxTimesOthers); float sum = 0; float product = 1; float max = 0; @@ -410,7 +411,8 @@ public class CheckHits { } else if (maxTimesOthers) { combined = max + x * (sum - max); } else { - Assert.assertTrue("should never get here!",false); + Assert.assertTrue("should never get here!", computedOf); + combined = value; } Assert.assertEquals(q+": actual subDetails combined=="+combined+ " != value="+value+" Explanation: "+expl,