mirror of https://github.com/apache/lucene.git
LUCENE-6711: Use CollectionStatistics.docCount() for IDF and average field length computations
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1695744 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
1e0e15af6c
commit
9dc862147e
|
@ -19,6 +19,10 @@ New Features
|
|||
for counting ranges that align with the underlying terms as defined by the
|
||||
NumberRangePrefixTree (e.g. familiar date units like days). (David Smiley)
|
||||
|
||||
* LUCENE-6711: Use CollectionStatistics.docCount() for IDF and average field
|
||||
length computations, to avoid skew from documents that don't have the field.
|
||||
(Ahmet Arslan via Robert Muir)
|
||||
|
||||
API Changes
|
||||
|
||||
* LUCENE-3312: The API of oal.document was restructured to
|
||||
|
|
|
@ -1,5 +1,22 @@
|
|||
# Apache Lucene Migration Guide
|
||||
|
||||
## The way the number of documents is calculated has changed (LUCENE-6711)
|
||||
The number of documents (numDocs) is used to calculate term specificity (idf) and average document length (avdl).
|
||||
Prior to LUCENE-6711, collectionStats.maxDoc() was used for the statistics.
|
||||
Now, collectionStats.docCount() is used whenever it is available; if it is not (i.e. it returns -1), collectionStats.maxDoc() is used instead.
|
||||
|
||||
Assume that a collection contains 100 documents, and 50 of them have a "keywords" field.
|
||||
In this example, maxDoc is 100 while docCount is 50 for the "keywords" field.
|
||||
The total number of tokens for the "keywords" field is divided by docCount to obtain avdl.
|
||||
Therefore, docCount, which is the total number of documents that have at least one term for the field, is a more precise metric for optional fields.
|
||||
|
||||
DefaultSimilarity does not leverage avdl, so this change should have a relatively minor effect on the result list.
|
||||
This is because the relative idf values of terms will remain the same.
|
||||
However, when combined with other factors such as term frequency, the relative ranking of documents could change.
|
||||
Some Similarity implementations (such as the ones instantiated with NormalizationH2 and BM25) take avdl into account and would show a notable change in the ranked list.
|
||||
This is especially true if you have a collection of documents with varying lengths.
|
||||
This is because NormalizationH2 tends to penalize documents longer than avdl.
|
||||
|
||||
## Separation of IndexDocument and StoredDocument (LUCENE-3312)
|
||||
|
||||
The API of oal.document was restructured to differentiate between stored
|
||||
|
|
|
@ -63,9 +63,9 @@ public class BM25Similarity extends Similarity {
|
|||
this.b = 0.75f;
|
||||
}
|
||||
|
||||
/** Implemented as <code>log(1 + (numDocs - docFreq + 0.5)/(docFreq + 0.5))</code>. */
|
||||
protected float idf(long docFreq, long numDocs) {
|
||||
return (float) Math.log(1 + (numDocs - docFreq + 0.5D)/(docFreq + 0.5D));
|
||||
/** Implemented as <code>log(1 + (docCount - docFreq + 0.5)/(docFreq + 0.5))</code>. */
|
||||
protected float idf(long docFreq, long docCount) {
|
||||
return (float) Math.log(1 + (docCount - docFreq + 0.5D)/(docFreq + 0.5D));
|
||||
}
|
||||
|
||||
/** Implemented as <code>1 / (distance + 1)</code>. */
|
||||
|
@ -78,7 +78,7 @@ public class BM25Similarity extends Similarity {
|
|||
return 1;
|
||||
}
|
||||
|
||||
/** The default implementation computes the average as <code>sumTotalTermFreq / maxDoc</code>,
|
||||
/** The default implementation computes the average as <code>sumTotalTermFreq / docCount</code>,
|
||||
* or returns <code>1</code> if the index does not store sumTotalTermFreq:
|
||||
* any field that omits frequency information). */
|
||||
protected float avgFieldLength(CollectionStatistics collectionStats) {
|
||||
|
@ -86,7 +86,8 @@ public class BM25Similarity extends Similarity {
|
|||
if (sumTotalTermFreq <= 0) {
|
||||
return 1f; // field does not exist, or stat is unsupported
|
||||
} else {
|
||||
return (float) (sumTotalTermFreq / (double) collectionStats.maxDoc());
|
||||
final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
|
||||
return (float) (sumTotalTermFreq / (double) docCount);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -150,14 +151,14 @@ public class BM25Similarity extends Similarity {
|
|||
* The default implementation uses:
|
||||
*
|
||||
* <pre class="prettyprint">
|
||||
* idf(docFreq, searcher.maxDoc());
|
||||
* idf(docFreq, docCount);
|
||||
* </pre>
|
||||
*
|
||||
* Note that {@link CollectionStatistics#maxDoc()} is used instead of
|
||||
* Note that {@link CollectionStatistics#docCount()} is used instead of
|
||||
* {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also
|
||||
* {@link TermStatistics#docFreq()} is used, and when the latter
|
||||
* is inaccurate, so is {@link CollectionStatistics#maxDoc()}, and in the same direction.
|
||||
* In addition, {@link CollectionStatistics#maxDoc()} is more efficient to compute
|
||||
* is inaccurate, so is {@link CollectionStatistics#docCount()}, and in the same direction.
|
||||
* In addition, {@link CollectionStatistics#docCount()} does not skew when fields are sparse.
|
||||
*
|
||||
* @param collectionStats collection-level statistics
|
||||
* @param termStats term-level statistics for the term
|
||||
|
@ -166,9 +167,9 @@ public class BM25Similarity extends Similarity {
|
|||
*/
|
||||
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
|
||||
final long df = termStats.docFreq();
|
||||
final long max = collectionStats.maxDoc();
|
||||
final float idf = idf(df, max);
|
||||
return Explanation.match(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")");
|
||||
final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
|
||||
final float idf = idf(df, docCount);
|
||||
return Explanation.match(idf, "idf(docFreq=" + df + ", docCount=" + docCount + ")");
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -185,13 +186,13 @@ public class BM25Similarity extends Similarity {
|
|||
* for each term.
|
||||
*/
|
||||
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
|
||||
final long max = collectionStats.maxDoc();
|
||||
final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
|
||||
float idf = 0.0f;
|
||||
List<Explanation> details = new ArrayList<>();
|
||||
for (final TermStatistics stat : termStats ) {
|
||||
final long df = stat.docFreq();
|
||||
final float termIdf = idf(df, max);
|
||||
details.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"));
|
||||
final float termIdf = idf(df, docCount);
|
||||
details.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ")"));
|
||||
idf += termIdf;
|
||||
}
|
||||
return Explanation.match(idf, "idf(), sum of:", details);
|
||||
|
|
|
@ -133,10 +133,10 @@ public class DefaultSimilarity extends TFIDFSimilarity {
|
|||
return 1;
|
||||
}
|
||||
|
||||
/** Implemented as <code>log(numDocs/(docFreq+1)) + 1</code>. */
|
||||
/** Implemented as <code>log(docCount/(docFreq+1)) + 1</code>. */
|
||||
@Override
|
||||
public float idf(long docFreq, long numDocs) {
|
||||
return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0);
|
||||
public float idf(long docFreq, long docCount) {
|
||||
return (float)(Math.log(docCount/(double)(docFreq+1)) + 1.0);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -102,7 +102,7 @@ public abstract class SimilarityBase extends Similarity {
|
|||
protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
|
||||
// #positions(field) must be >= #positions(term)
|
||||
assert collectionStats.sumTotalTermFreq() == -1 || collectionStats.sumTotalTermFreq() >= termStats.totalTermFreq();
|
||||
long numberOfDocuments = collectionStats.maxDoc();
|
||||
long numberOfDocuments = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
|
||||
|
||||
long docFreq = termStats.docFreq();
|
||||
long totalTermFreq = termStats.totalTermFreq();
|
||||
|
|
|
@ -346,7 +346,7 @@ import org.apache.lucene.util.BytesRef;
|
|||
* </td>
|
||||
* <td valign="middle" align="center">
|
||||
* <table summary="inverse document frequency computation">
|
||||
* <tr><td align="center" style="text-align: center"><small>numDocs</small></td></tr>
|
||||
* <tr><td align="center" style="text-align: center"><small>docCount</small></td></tr>
|
||||
* <tr><td align="center" style="text-align: center">–––––––––</td></tr>
|
||||
* <tr><td align="center" style="text-align: center"><small>docFreq+1</small></td></tr>
|
||||
* </table>
|
||||
|
@ -566,14 +566,14 @@ public abstract class TFIDFSimilarity extends Similarity {
|
|||
* The default implementation uses:
|
||||
*
|
||||
* <pre class="prettyprint">
|
||||
* idf(docFreq, searcher.maxDoc());
|
||||
* idf(docFreq, docCount);
|
||||
* </pre>
|
||||
*
|
||||
* Note that {@link CollectionStatistics#maxDoc()} is used instead of
|
||||
* Note that {@link CollectionStatistics#docCount()} is used instead of
|
||||
* {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also
|
||||
* {@link TermStatistics#docFreq()} is used, and when the latter
|
||||
* is inaccurate, so is {@link CollectionStatistics#maxDoc()}, and in the same direction.
|
||||
* In addition, {@link CollectionStatistics#maxDoc()} is more efficient to compute
|
||||
* is inaccurate, so is {@link CollectionStatistics#docCount()}, and in the same direction.
|
||||
* In addition, {@link CollectionStatistics#docCount()} does not skew when fields are sparse.
|
||||
*
|
||||
* @param collectionStats collection-level statistics
|
||||
* @param termStats term-level statistics for the term
|
||||
|
@ -582,9 +582,9 @@ public abstract class TFIDFSimilarity extends Similarity {
|
|||
*/
|
||||
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
|
||||
final long df = termStats.docFreq();
|
||||
final long max = collectionStats.maxDoc();
|
||||
final float idf = idf(df, max);
|
||||
return Explanation.match(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")");
|
||||
final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
|
||||
final float idf = idf(df, docCount);
|
||||
return Explanation.match(idf, "idf(docFreq=" + df + ", docCount=" + docCount + ")");
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -601,13 +601,13 @@ public abstract class TFIDFSimilarity extends Similarity {
|
|||
* for each term.
|
||||
*/
|
||||
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
|
||||
final long max = collectionStats.maxDoc();
|
||||
final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
|
||||
float idf = 0.0f;
|
||||
List<Explanation> subs = new ArrayList<>();
|
||||
for (final TermStatistics stat : termStats ) {
|
||||
final long df = stat.docFreq();
|
||||
final float termIdf = idf(df, max);
|
||||
subs.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"));
|
||||
final float termIdf = idf(df, docCount);
|
||||
subs.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ")"));
|
||||
idf += termIdf;
|
||||
}
|
||||
return Explanation.match(idf, "idf(), sum of:", subs);
|
||||
|
@ -623,10 +623,10 @@ public abstract class TFIDFSimilarity extends Similarity {
|
|||
* and smaller values for common terms.
|
||||
*
|
||||
* @param docFreq the number of documents which contain the term
|
||||
* @param numDocs the total number of documents in the collection
|
||||
* @param docCount the total number of documents that have at least one term for the field (or maxDoc when that statistic is unavailable)
|
||||
* @return a score factor based on the term's document frequency
|
||||
*/
|
||||
public abstract float idf(long docFreq, long numDocs);
|
||||
public abstract float idf(long docFreq, long docCount);
|
||||
|
||||
/**
|
||||
* Compute an index-time normalization value for this field instance.
|
||||
|
|
|
@ -118,7 +118,7 @@ public class TestMaxTermFrequency extends LuceneTestCase {
|
|||
@Override public float coord(int overlap, int maxOverlap) { return 0; }
|
||||
@Override public float queryNorm(float sumOfSquaredWeights) { return 0; }
|
||||
@Override public float tf(float freq) { return 0; }
|
||||
@Override public float idf(long docFreq, long numDocs) { return 0; }
|
||||
@Override public float idf(long docFreq, long docCount) { return 0; }
|
||||
@Override public float sloppyFreq(int distance) { return 0; }
|
||||
@Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 0; }
|
||||
}
|
||||
|
|
|
@ -67,7 +67,7 @@ public class TestNorms extends LuceneTestCase {
|
|||
@Override public float coord(int overlap, int maxOverlap) { return 0; }
|
||||
@Override public float queryNorm(float sumOfSquaredWeights) { return 0; }
|
||||
@Override public float tf(float freq) { return 0; }
|
||||
@Override public float idf(long docFreq, long numDocs) { return 0; }
|
||||
@Override public float idf(long docFreq, long docCount) { return 0; }
|
||||
@Override public float sloppyFreq(int distance) { return 0; }
|
||||
@Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 0; }
|
||||
}
|
||||
|
|
|
@ -54,7 +54,7 @@ public class TestOmitTf extends LuceneTestCase {
|
|||
@Override public float lengthNorm(FieldInvertState state) { return state.getBoost(); }
|
||||
@Override public float tf(float freq) { return freq; }
|
||||
@Override public float sloppyFreq(int distance) { return 2.0f; }
|
||||
@Override public float idf(long docFreq, long numDocs) { return 1.0f; }
|
||||
@Override public float idf(long docFreq, long docCount) { return 1.0f; }
|
||||
@Override public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] termStats) {
|
||||
return Explanation.match(1.0f, "Inexplicable");
|
||||
}
|
||||
|
|
|
@ -80,7 +80,7 @@ public class TestDisjunctionMaxQuery extends LuceneTestCase {
|
|||
}
|
||||
|
||||
@Override
|
||||
public float idf(long docFreq, long numDocs) {
|
||||
public float idf(long docFreq, long docCount) {
|
||||
return 1.0f;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -46,7 +46,7 @@ public class TestSimilarity extends LuceneTestCase {
|
|||
@Override public float lengthNorm(FieldInvertState state) { return state.getBoost(); }
|
||||
@Override public float tf(float freq) { return freq; }
|
||||
@Override public float sloppyFreq(int distance) { return 2.0f; }
|
||||
@Override public float idf(long docFreq, long numDocs) { return 1.0f; }
|
||||
@Override public float idf(long docFreq, long docCount) { return 1.0f; }
|
||||
@Override public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] stats) {
|
||||
return Explanation.match(1.0f, "Inexplicable");
|
||||
}
|
||||
|
|
|
@ -142,7 +142,7 @@ public class TestSimilarityProvider extends LuceneTestCase {
|
|||
}
|
||||
|
||||
@Override
|
||||
public float idf(long docFreq, long numDocs) {
|
||||
public float idf(long docFreq, long docCount) {
|
||||
return 1f;
|
||||
}
|
||||
|
||||
|
@ -190,7 +190,7 @@ public class TestSimilarityProvider extends LuceneTestCase {
|
|||
}
|
||||
|
||||
@Override
|
||||
public float idf(long docFreq, long numDocs) {
|
||||
public float idf(long docFreq, long docCount) {
|
||||
return 10f;
|
||||
}
|
||||
|
||||
|
|
|
@ -288,7 +288,7 @@ public class TestPayloadTermQuery extends LuceneTestCase {
|
|||
}
|
||||
|
||||
@Override
|
||||
public float idf(long docFreq, long numDocs) {
|
||||
public float idf(long docFreq, long docCount) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
|
|
@ -38,6 +38,7 @@ import org.apache.lucene.search.spans.SpanOrQuery;
|
|||
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
|
||||
/**
|
||||
* Tests against all the similarities we have
|
||||
|
@ -160,6 +161,48 @@ public class TestSimilarity2 extends LuceneTestCase {
|
|||
dir.close();
|
||||
}
|
||||
|
||||
/** make sure scores are not skewed by docs not containing the field */
|
||||
public void testNoFieldSkew() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
|
||||
Document doc = new Document();
|
||||
doc.add(newTextField("foo", "bar baz somethingelse", Field.Store.NO));
|
||||
iw.addDocument(doc);
|
||||
IndexReader ir = iw.getReader();
|
||||
IndexSearcher is = newSearcher(ir);
|
||||
|
||||
BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
|
||||
queryBuilder.add(new TermQuery(new Term("foo", "bar")), BooleanClause.Occur.SHOULD);
|
||||
queryBuilder.add(new TermQuery(new Term("foo", "baz")), BooleanClause.Occur.SHOULD);
|
||||
Query query = queryBuilder.build();
|
||||
|
||||
// collect scores
|
||||
List<Float> scores = new ArrayList<>();
|
||||
for (Similarity sim : sims) {
|
||||
is.setSimilarity(sim);
|
||||
scores.add(is.explain(query, 0).getValue());
|
||||
}
|
||||
ir.close();
|
||||
|
||||
// add some additional docs without the field
|
||||
int numExtraDocs = TestUtil.nextInt(random(), 1, 1000);
|
||||
for (int i = 0; i < numExtraDocs; i++) {
|
||||
iw.addDocument(new Document());
|
||||
}
|
||||
|
||||
// check scores are the same
|
||||
ir = iw.getReader();
|
||||
is = newSearcher(ir);
|
||||
for (int i = 0; i < sims.size(); i++) {
|
||||
is.setSimilarity(sims.get(i));
|
||||
assertEquals(scores.get(i).floatValue(), is.explain(query, 0).getValue(), 0F);
|
||||
}
|
||||
|
||||
iw.close();
|
||||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
/** make sure all sims work if TF is omitted */
|
||||
public void testOmitTF() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
|
|
|
@ -203,10 +203,10 @@ class PreciseDefaultSimilarity extends TFIDFSimilarity {
|
|||
return 1;
|
||||
}
|
||||
|
||||
/** Implemented as <code>log(numDocs/(docFreq+1)) + 1</code>. */
|
||||
/** Implemented as <code>log(docCount/(docFreq+1)) + 1</code>. */
|
||||
@Override
|
||||
public float idf(long docFreq, long numDocs) {
|
||||
return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0);
|
||||
public float idf(long docFreq, long docCount) {
|
||||
return (float)(Math.log(docCount/(double)(docFreq+1)) + 1.0);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -566,8 +566,8 @@ public class TestExtendedDismaxParser extends SolrTestCaseJ4 {
|
|||
}
|
||||
|
||||
public void testAliasingBoost() throws Exception {
|
||||
assertQ(req("defType","edismax", "q","Zapp Pig", "qf","myalias", "f.myalias.qf","name trait_ss^0.5"), "//result/doc[1]/str[@name='id']=42", "//result/doc[2]/str[@name='id']=47");//doc 42 should score higher than 46
|
||||
assertQ(req("defType","edismax", "q","Zapp Pig", "qf","myalias^100 name", "f.myalias.qf","trait_ss^0.5"), "//result/doc[1]/str[@name='id']=47", "//result/doc[2]/str[@name='id']=42");//Now the order should be inverse
|
||||
assertQ(req("defType","edismax", "q","Zapp Pig", "qf","myalias", "f.myalias.qf","name trait_ss^0.1"), "//result/doc[1]/str[@name='id']=42", "//result/doc[2]/str[@name='id']=47");//doc 42 should score higher than 46
|
||||
assertQ(req("defType","edismax", "q","Zapp Pig", "qf","myalias^100 name", "f.myalias.qf","trait_ss^0.1"), "//result/doc[1]/str[@name='id']=47", "//result/doc[2]/str[@name='id']=42");//Now the order should be inverse
|
||||
}
|
||||
|
||||
public void testCyclicAliasing() throws Exception {
|
||||
|
|
Loading…
Reference in New Issue