LUCENE-6711: Use CollectionStatistics.docCount() for IDF and average field length computations

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1695744 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2015-08-13 17:37:15 +00:00
parent 1e0e15af6c
commit 9dc862147e
16 changed files with 110 additions and 45 deletions

View File

@ -19,6 +19,10 @@ New Features
for counting ranges that align with the underlying terms as defined by the for counting ranges that align with the underlying terms as defined by the
NumberRangePrefixTree (e.g. familiar date units like days). (David Smiley) NumberRangePrefixTree (e.g. familiar date units like days). (David Smiley)
* LUCENE-6711: Use CollectionStatistics.docCount() for IDF and average field
length computations, to avoid skew from documents that don't have the field.
(Ahmet Arslan via Robert Muir)
API Changes API Changes
* LUCENE-3312: The API of oal.document was restructured to * LUCENE-3312: The API of oal.document was restructured to

View File

@ -1,5 +1,22 @@
# Apache Lucene Migration Guide # Apache Lucene Migration Guide
## How the number of documents is calculated has changed (LUCENE-6711)
The number of documents (numDocs) is used to calculate term specificity (idf) and average document length (avdl).
Prior to LUCENE-6711, collectionStats.maxDoc() was used for the statistics.
Now, collectionStats.docCount() is used whenever possible; if it is not available, maxDoc() is used.
Assume that a collection contains 100 documents, and 50 of them have "keywords" field.
In this example, maxDoc is 100 while docCount is 50 for the "keywords" field.
The total number of tokens for "keywords" field is divided by docCount to obtain avdl.
Therefore, docCount, which is the total number of documents that have at least one term for the field, is a more precise metric for optional fields.
DefaultSimilarity does not leverage avdl, so this change will have a relatively minor effect on the result list,
because the relative idf values of terms will remain the same.
However, when combined with other factors such as term frequency, relative ranking of documents could change.
Some Similarity implementations (such as the ones instantiated with NormalizationH2 and BM25) take avdl into account and will show notable changes in the ranked list,
especially for collections of documents with varying lengths,
because NormalizationH2 tends to punish documents longer than avdl.
## Separation of IndexDocument and StoredDocument (LUCENE-3312) ## Separation of IndexDocument and StoredDocument (LUCENE-3312)
The API of oal.document was restructured to differentiate between stored The API of oal.document was restructured to differentiate between stored

View File

@ -63,9 +63,9 @@ public class BM25Similarity extends Similarity {
this.b = 0.75f; this.b = 0.75f;
} }
/** Implemented as <code>log(1 + (numDocs - docFreq + 0.5)/(docFreq + 0.5))</code>. */ /** Implemented as <code>log(1 + (docCount - docFreq + 0.5)/(docFreq + 0.5))</code>. */
protected float idf(long docFreq, long numDocs) { protected float idf(long docFreq, long docCount) {
return (float) Math.log(1 + (numDocs - docFreq + 0.5D)/(docFreq + 0.5D)); return (float) Math.log(1 + (docCount - docFreq + 0.5D)/(docFreq + 0.5D));
} }
/** Implemented as <code>1 / (distance + 1)</code>. */ /** Implemented as <code>1 / (distance + 1)</code>. */
@ -78,7 +78,7 @@ public class BM25Similarity extends Similarity {
return 1; return 1;
} }
/** The default implementation computes the average as <code>sumTotalTermFreq / maxDoc</code>, /** The default implementation computes the average as <code>sumTotalTermFreq / docCount</code>,
* or returns <code>1</code> if the index does not store sumTotalTermFreq: * or returns <code>1</code> if the index does not store sumTotalTermFreq:
* any field that omits frequency information). */ * any field that omits frequency information). */
protected float avgFieldLength(CollectionStatistics collectionStats) { protected float avgFieldLength(CollectionStatistics collectionStats) {
@ -86,7 +86,8 @@ public class BM25Similarity extends Similarity {
if (sumTotalTermFreq <= 0) { if (sumTotalTermFreq <= 0) {
return 1f; // field does not exist, or stat is unsupported return 1f; // field does not exist, or stat is unsupported
} else { } else {
return (float) (sumTotalTermFreq / (double) collectionStats.maxDoc()); final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
return (float) (sumTotalTermFreq / (double) docCount);
} }
} }
@ -150,14 +151,14 @@ public class BM25Similarity extends Similarity {
* The default implementation uses: * The default implementation uses:
* *
* <pre class="prettyprint"> * <pre class="prettyprint">
* idf(docFreq, searcher.maxDoc()); * idf(docFreq, docCount);
* </pre> * </pre>
* *
* Note that {@link CollectionStatistics#maxDoc()} is used instead of * Note that {@link CollectionStatistics#docCount()} is used instead of
* {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also
* {@link TermStatistics#docFreq()} is used, and when the latter * {@link TermStatistics#docFreq()} is used, and when the latter
* is inaccurate, so is {@link CollectionStatistics#maxDoc()}, and in the same direction. * is inaccurate, so is {@link CollectionStatistics#docCount()}, and in the same direction.
* In addition, {@link CollectionStatistics#maxDoc()} is more efficient to compute * In addition, {@link CollectionStatistics#docCount()} does not skew when fields are sparse.
* *
* @param collectionStats collection-level statistics * @param collectionStats collection-level statistics
* @param termStats term-level statistics for the term * @param termStats term-level statistics for the term
@ -166,9 +167,9 @@ public class BM25Similarity extends Similarity {
*/ */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) { public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
final long df = termStats.docFreq(); final long df = termStats.docFreq();
final long max = collectionStats.maxDoc(); final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
final float idf = idf(df, max); final float idf = idf(df, docCount);
return Explanation.match(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"); return Explanation.match(idf, "idf(docFreq=" + df + ", docCount=" + docCount + ")");
} }
/** /**
@ -185,13 +186,13 @@ public class BM25Similarity extends Similarity {
* for each term. * for each term.
*/ */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) { public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
final long max = collectionStats.maxDoc(); final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
float idf = 0.0f; float idf = 0.0f;
List<Explanation> details = new ArrayList<>(); List<Explanation> details = new ArrayList<>();
for (final TermStatistics stat : termStats ) { for (final TermStatistics stat : termStats ) {
final long df = stat.docFreq(); final long df = stat.docFreq();
final float termIdf = idf(df, max); final float termIdf = idf(df, docCount);
details.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")")); details.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ")"));
idf += termIdf; idf += termIdf;
} }
return Explanation.match(idf, "idf(), sum of:", details); return Explanation.match(idf, "idf(), sum of:", details);

View File

@ -133,10 +133,10 @@ public class DefaultSimilarity extends TFIDFSimilarity {
return 1; return 1;
} }
/** Implemented as <code>log(numDocs/(docFreq+1)) + 1</code>. */ /** Implemented as <code>log(docCount/(docFreq+1)) + 1</code>. */
@Override @Override
public float idf(long docFreq, long numDocs) { public float idf(long docFreq, long docCount) {
return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0); return (float)(Math.log(docCount/(double)(docFreq+1)) + 1.0);
} }
/** /**

View File

@ -102,7 +102,7 @@ public abstract class SimilarityBase extends Similarity {
protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) { protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
// #positions(field) must be >= #positions(term) // #positions(field) must be >= #positions(term)
assert collectionStats.sumTotalTermFreq() == -1 || collectionStats.sumTotalTermFreq() >= termStats.totalTermFreq(); assert collectionStats.sumTotalTermFreq() == -1 || collectionStats.sumTotalTermFreq() >= termStats.totalTermFreq();
long numberOfDocuments = collectionStats.maxDoc(); long numberOfDocuments = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
long docFreq = termStats.docFreq(); long docFreq = termStats.docFreq();
long totalTermFreq = termStats.totalTermFreq(); long totalTermFreq = termStats.totalTermFreq();

View File

@ -346,7 +346,7 @@ import org.apache.lucene.util.BytesRef;
* </td> * </td>
* <td valign="middle" align="center"> * <td valign="middle" align="center">
* <table summary="inverse document frequency computation"> * <table summary="inverse document frequency computation">
* <tr><td align="center" style="text-align: center"><small>numDocs</small></td></tr> * <tr><td align="center" style="text-align: center"><small>docCount</small></td></tr>
* <tr><td align="center" style="text-align: center">&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;</td></tr> * <tr><td align="center" style="text-align: center">&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;</td></tr>
* <tr><td align="center" style="text-align: center"><small>docFreq+1</small></td></tr> * <tr><td align="center" style="text-align: center"><small>docFreq+1</small></td></tr>
* </table> * </table>
@ -566,14 +566,14 @@ public abstract class TFIDFSimilarity extends Similarity {
* The default implementation uses: * The default implementation uses:
* *
* <pre class="prettyprint"> * <pre class="prettyprint">
* idf(docFreq, searcher.maxDoc()); * idf(docFreq, docCount);
* </pre> * </pre>
* *
* Note that {@link CollectionStatistics#maxDoc()} is used instead of * Note that {@link CollectionStatistics#docCount()} is used instead of
* {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also
* {@link TermStatistics#docFreq()} is used, and when the latter * {@link TermStatistics#docFreq()} is used, and when the latter
* is inaccurate, so is {@link CollectionStatistics#maxDoc()}, and in the same direction. * is inaccurate, so is {@link CollectionStatistics#docCount()}, and in the same direction.
* In addition, {@link CollectionStatistics#maxDoc()} is more efficient to compute * In addition, {@link CollectionStatistics#docCount()} does not skew when fields are sparse.
* *
* @param collectionStats collection-level statistics * @param collectionStats collection-level statistics
* @param termStats term-level statistics for the term * @param termStats term-level statistics for the term
@ -582,9 +582,9 @@ public abstract class TFIDFSimilarity extends Similarity {
*/ */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) { public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
final long df = termStats.docFreq(); final long df = termStats.docFreq();
final long max = collectionStats.maxDoc(); final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
final float idf = idf(df, max); final float idf = idf(df, docCount);
return Explanation.match(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"); return Explanation.match(idf, "idf(docFreq=" + df + ", docCount=" + docCount + ")");
} }
/** /**
@ -601,13 +601,13 @@ public abstract class TFIDFSimilarity extends Similarity {
* for each term. * for each term.
*/ */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) { public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
final long max = collectionStats.maxDoc(); final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
float idf = 0.0f; float idf = 0.0f;
List<Explanation> subs = new ArrayList<>(); List<Explanation> subs = new ArrayList<>();
for (final TermStatistics stat : termStats ) { for (final TermStatistics stat : termStats ) {
final long df = stat.docFreq(); final long df = stat.docFreq();
final float termIdf = idf(df, max); final float termIdf = idf(df, docCount);
subs.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")")); subs.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ")"));
idf += termIdf; idf += termIdf;
} }
return Explanation.match(idf, "idf(), sum of:", subs); return Explanation.match(idf, "idf(), sum of:", subs);
@ -623,10 +623,10 @@ public abstract class TFIDFSimilarity extends Similarity {
* and smaller values for common terms. * and smaller values for common terms.
* *
* @param docFreq the number of documents which contain the term * @param docFreq the number of documents which contain the term
* @param numDocs the total number of documents in the collection * @param docCount the total number of documents in the collection
* @return a score factor based on the term's document frequency * @return a score factor based on the term's document frequency
*/ */
public abstract float idf(long docFreq, long numDocs); public abstract float idf(long docFreq, long docCount);
/** /**
* Compute an index-time normalization value for this field instance. * Compute an index-time normalization value for this field instance.

View File

@ -118,7 +118,7 @@ public class TestMaxTermFrequency extends LuceneTestCase {
@Override public float coord(int overlap, int maxOverlap) { return 0; } @Override public float coord(int overlap, int maxOverlap) { return 0; }
@Override public float queryNorm(float sumOfSquaredWeights) { return 0; } @Override public float queryNorm(float sumOfSquaredWeights) { return 0; }
@Override public float tf(float freq) { return 0; } @Override public float tf(float freq) { return 0; }
@Override public float idf(long docFreq, long numDocs) { return 0; } @Override public float idf(long docFreq, long docCount) { return 0; }
@Override public float sloppyFreq(int distance) { return 0; } @Override public float sloppyFreq(int distance) { return 0; }
@Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 0; } @Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 0; }
} }

View File

@ -67,7 +67,7 @@ public class TestNorms extends LuceneTestCase {
@Override public float coord(int overlap, int maxOverlap) { return 0; } @Override public float coord(int overlap, int maxOverlap) { return 0; }
@Override public float queryNorm(float sumOfSquaredWeights) { return 0; } @Override public float queryNorm(float sumOfSquaredWeights) { return 0; }
@Override public float tf(float freq) { return 0; } @Override public float tf(float freq) { return 0; }
@Override public float idf(long docFreq, long numDocs) { return 0; } @Override public float idf(long docFreq, long docCount) { return 0; }
@Override public float sloppyFreq(int distance) { return 0; } @Override public float sloppyFreq(int distance) { return 0; }
@Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 0; } @Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 0; }
} }

View File

@ -54,7 +54,7 @@ public class TestOmitTf extends LuceneTestCase {
@Override public float lengthNorm(FieldInvertState state) { return state.getBoost(); } @Override public float lengthNorm(FieldInvertState state) { return state.getBoost(); }
@Override public float tf(float freq) { return freq; } @Override public float tf(float freq) { return freq; }
@Override public float sloppyFreq(int distance) { return 2.0f; } @Override public float sloppyFreq(int distance) { return 2.0f; }
@Override public float idf(long docFreq, long numDocs) { return 1.0f; } @Override public float idf(long docFreq, long docCount) { return 1.0f; }
@Override public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] termStats) { @Override public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] termStats) {
return Explanation.match(1.0f, "Inexplicable"); return Explanation.match(1.0f, "Inexplicable");
} }

View File

@ -80,7 +80,7 @@ public class TestDisjunctionMaxQuery extends LuceneTestCase {
} }
@Override @Override
public float idf(long docFreq, long numDocs) { public float idf(long docFreq, long docCount) {
return 1.0f; return 1.0f;
} }
} }

View File

@ -46,7 +46,7 @@ public class TestSimilarity extends LuceneTestCase {
@Override public float lengthNorm(FieldInvertState state) { return state.getBoost(); } @Override public float lengthNorm(FieldInvertState state) { return state.getBoost(); }
@Override public float tf(float freq) { return freq; } @Override public float tf(float freq) { return freq; }
@Override public float sloppyFreq(int distance) { return 2.0f; } @Override public float sloppyFreq(int distance) { return 2.0f; }
@Override public float idf(long docFreq, long numDocs) { return 1.0f; } @Override public float idf(long docFreq, long docCount) { return 1.0f; }
@Override public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] stats) { @Override public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] stats) {
return Explanation.match(1.0f, "Inexplicable"); return Explanation.match(1.0f, "Inexplicable");
} }

View File

@ -142,7 +142,7 @@ public class TestSimilarityProvider extends LuceneTestCase {
} }
@Override @Override
public float idf(long docFreq, long numDocs) { public float idf(long docFreq, long docCount) {
return 1f; return 1f;
} }
@ -190,7 +190,7 @@ public class TestSimilarityProvider extends LuceneTestCase {
} }
@Override @Override
public float idf(long docFreq, long numDocs) { public float idf(long docFreq, long docCount) {
return 10f; return 10f;
} }

View File

@ -288,7 +288,7 @@ public class TestPayloadTermQuery extends LuceneTestCase {
} }
@Override @Override
public float idf(long docFreq, long numDocs) { public float idf(long docFreq, long docCount) {
return 1; return 1;
} }

View File

@ -38,6 +38,7 @@ import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
/** /**
* Tests against all the similarities we have * Tests against all the similarities we have
@ -160,6 +161,48 @@ public class TestSimilarity2 extends LuceneTestCase {
dir.close(); dir.close();
} }
/** make sure scores are not skewed by docs not containing the field */
public void testNoFieldSkew() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
Document doc = new Document();
doc.add(newTextField("foo", "bar baz somethingelse", Field.Store.NO));
iw.addDocument(doc);
IndexReader ir = iw.getReader();
IndexSearcher is = newSearcher(ir);
BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
queryBuilder.add(new TermQuery(new Term("foo", "bar")), BooleanClause.Occur.SHOULD);
queryBuilder.add(new TermQuery(new Term("foo", "baz")), BooleanClause.Occur.SHOULD);
Query query = queryBuilder.build();
// collect scores
List<Float> scores = new ArrayList<>();
for (Similarity sim : sims) {
is.setSimilarity(sim);
scores.add(is.explain(query, 0).getValue());
}
ir.close();
// add some additional docs without the field
int numExtraDocs = TestUtil.nextInt(random(), 1, 1000);
for (int i = 0; i < numExtraDocs; i++) {
iw.addDocument(new Document());
}
// check scores are the same
ir = iw.getReader();
is = newSearcher(ir);
for (int i = 0; i < sims.size(); i++) {
is.setSimilarity(sims.get(i));
assertEquals(scores.get(i).floatValue(), is.explain(query, 0).getValue(), 0F);
}
iw.close();
ir.close();
dir.close();
}
/** make sure all sims work if TF is omitted */ /** make sure all sims work if TF is omitted */
public void testOmitTF() throws Exception { public void testOmitTF() throws Exception {
Directory dir = newDirectory(); Directory dir = newDirectory();

View File

@ -203,10 +203,10 @@ class PreciseDefaultSimilarity extends TFIDFSimilarity {
return 1; return 1;
} }
/** Implemented as <code>log(numDocs/(docFreq+1)) + 1</code>. */ /** Implemented as <code>log(docCount/(docFreq+1)) + 1</code>. */
@Override @Override
public float idf(long docFreq, long numDocs) { public float idf(long docFreq, long docCount) {
return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0); return (float)(Math.log(docCount/(double)(docFreq+1)) + 1.0);
} }
/** /**

View File

@ -566,8 +566,8 @@ public class TestExtendedDismaxParser extends SolrTestCaseJ4 {
} }
public void testAliasingBoost() throws Exception { public void testAliasingBoost() throws Exception {
assertQ(req("defType","edismax", "q","Zapp Pig", "qf","myalias", "f.myalias.qf","name trait_ss^0.5"), "//result/doc[1]/str[@name='id']=42", "//result/doc[2]/str[@name='id']=47");//doc 42 should score higher than 46 assertQ(req("defType","edismax", "q","Zapp Pig", "qf","myalias", "f.myalias.qf","name trait_ss^0.1"), "//result/doc[1]/str[@name='id']=42", "//result/doc[2]/str[@name='id']=47");//doc 42 should score higher than 46
assertQ(req("defType","edismax", "q","Zapp Pig", "qf","myalias^100 name", "f.myalias.qf","trait_ss^0.5"), "//result/doc[1]/str[@name='id']=47", "//result/doc[2]/str[@name='id']=42");//Now the order should be inverse assertQ(req("defType","edismax", "q","Zapp Pig", "qf","myalias^100 name", "f.myalias.qf","trait_ss^0.1"), "//result/doc[1]/str[@name='id']=47", "//result/doc[2]/str[@name='id']=42");//Now the order should be inverse
} }
public void testCyclicAliasing() throws Exception { public void testCyclicAliasing() throws Exception {