LUCENE-3430: fix some sims with SpanQueries (which sometimes score terms with df=0/ttf=0/...)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1169589 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2011-09-12 01:13:23 +00:00
parent a027a35583
commit 156a5c89ee
11 changed files with 82 additions and 44 deletions

View File

@ -26,8 +26,8 @@ import org.apache.lucene.search.Explanation;
public class AfterEffectB extends AfterEffect {
@Override
public final float score(BasicStats stats, float tfn) {
long F = stats.getTotalTermFreq();
int n = stats.getDocFreq();
long F = stats.getTotalTermFreq()+1;
int n = stats.getDocFreq()+1;
return (F + 1) / (n * (tfn + 1));
}

View File

@ -21,14 +21,14 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2;
/**
* Limiting form of the Bose-Einstein model. The formula used in Lucene differs
* slightly from the one in the original paper: {@code F} is increased by {@code tfn}
* slightly from the one in the original paper: {@code F} is increased by {@code tfn+1}
* and {@code N} is increased by {@code F}.
* @lucene.experimental
*/
public class BasicModelBE extends BasicModel {
@Override
public final float score(BasicStats stats, float tfn) {
double F = stats.getTotalTermFreq() + tfn;
double F = stats.getTotalTermFreq() + 1 + tfn;
// approximation only holds true when F << N, so we use N += F
double N = F + stats.getNumberOfDocuments();
return (float)(-log2((N - 1) * Math.E)

View File

@ -24,7 +24,7 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2;
* for DFR. The formula used in Lucene differs slightly from the one in the
* original paper: to avoid underflow for small values of {@code N} and
* {@code F}, {@code N} is increased by {@code 1} and
* {@code F} is always increased by {@code tfn}.
* {@code F} is always increased by {@code tfn+1}.
* <p>
* WARNING: for terms that do not meet the expected random distribution
* (e.g. stopwords), this model may give poor performance, such as
@ -37,7 +37,7 @@ public class BasicModelD extends BasicModel {
// we have to ensure phi is always < 1 for tiny TTF values, otherwise nphi can go negative,
// resulting in NaN. cleanest way is to unconditionally always add tfn + 1 to totalTermFreq
// to create a 'normalized' F.
double F = stats.getTotalTermFreq() + tfn;
double F = stats.getTotalTermFreq() + 1 + tfn;
double phi = (double)tfn / F;
double nphi = 1 - phi;
double p = 1.0 / (stats.getNumberOfDocuments() + 1);

View File

@ -21,7 +21,7 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2;
/**
* Geometric as limiting form of the Bose-Einstein model. The formula used in Lucene differs
* slightly from the one in the original paper: {@code F} is increased by {@code tfn}
* slightly from the one in the original paper: {@code F} is increased by {@code 1}
* and {@code N} is increased by {@code F}.
* @lucene.experimental
*/
@ -29,7 +29,9 @@ public class BasicModelG extends BasicModel {
@Override
public final float score(BasicStats stats, float tfn) {
// just like in BE, approximation only holds true when F << N, so we use lambda = F / (N + F)
double lambda = stats.getTotalTermFreq() / (double) (stats.getNumberOfDocuments() + stats.getTotalTermFreq());
double F = stats.getTotalTermFreq() + 1;
double N = stats.getNumberOfDocuments();
double lambda = F / (N + F);
// -log(1 / (lambda + 1)) -> log(lambda + 1)
return (float)(log2(lambda + 1) + tfn * log2((1 + lambda) / lambda));
}

View File

@ -33,7 +33,7 @@ public class BasicModelP extends BasicModel {
@Override
public final float score(BasicStats stats, float tfn) {
float lambda = (float)stats.getTotalTermFreq() / stats.getNumberOfDocuments();
float lambda = (float)(stats.getTotalTermFreq()+1) / (stats.getNumberOfDocuments()+1);
return (float)(tfn * log2(tfn / lambda)
+ (lambda + 1 / (12 * tfn) - tfn) * LOG2_E
+ 0.5 * log2(2 * Math.PI * tfn));

View File

@ -144,7 +144,7 @@ public abstract class LMSimilarity extends SimilarityBase {
public static class DefaultCollectionModel implements CollectionModel {
@Override
public float computeProbability(BasicStats stats) {
return (float)stats.getTotalTermFreq() / (stats.getNumberOfFieldTokens() +1);
return (stats.getTotalTermFreq()+1F) / (stats.getNumberOfFieldTokens()+1F);
}
@Override

View File

@ -20,13 +20,13 @@ package org.apache.lucene.search.similarities;
import org.apache.lucene.search.Explanation;
/**
* Computes lambda as {@code totalTermFreq / numberOfDocuments}.
* Computes lambda as {@code (docFreq+1) / (numberOfDocuments+1)}.
* @lucene.experimental
*/
public class LambdaDF extends Lambda {
@Override
public final float lambda(BasicStats stats) {
return (float)stats.getDocFreq() / stats.getNumberOfDocuments();
return (stats.getDocFreq()+1F) / (stats.getNumberOfDocuments()+1F);
}
@Override

View File

@ -20,13 +20,13 @@ package org.apache.lucene.search.similarities;
import org.apache.lucene.search.Explanation;
/**
* Computes lambda as {@code docFreq / numberOfDocuments}.
* Computes lambda as {@code (totalTermFreq+1) / (numberOfDocuments+1)}.
* @lucene.experimental
*/
public class LambdaTTF extends Lambda {
@Override
public final float lambda(BasicStats stats) {
return (float)stats.getTotalTermFreq() / stats.getNumberOfDocuments();
return (stats.getTotalTermFreq()+1F) / (stats.getNumberOfDocuments()+1F);
}
@Override

View File

@ -34,7 +34,7 @@ public class NormalizationH3 extends Normalization {
@Override
public float tfn(BasicStats stats, float tf, float len) {
return (tf + mu * (stats.getTotalTermFreq() / (float)stats.getNumberOfFieldTokens())) / (len + mu) * mu;
return (tf + mu * ((stats.getTotalTermFreq()+1F) / (stats.getNumberOfFieldTokens()+1F))) / (len + mu) * mu;
}
@Override

View File

@ -36,8 +36,12 @@ import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
@ -215,4 +219,36 @@ public class TestSimilarity2 extends LuceneTestCase {
ir.close();
dir.close();
}
/**
 * Make sure all sims work with spanOR(termX, termY) where termY does not exist.
 * <p>
 * Indexes a single document whose field "foo" contains "bar", then, for every
 * entry in {@code simProviders}, searches for spanOR(foo:bar, foo:baz) and checks
 * that the one matching document receives a non-negative, finite score.
 */
public void testCrazySpans() throws Exception {
// The problem: "normal" lucene queries create scorers, returning null if terms don't exist
// This means they never score a term that does not exist.
// however with spans, there is only one scorer for the whole hierarchy:
// inner queries are not real queries, their boosts are ignored, etc.
Directory dir = newDirectory();
RandomIndexWriter iw = new RandomIndexWriter(random, dir);
// index exactly one document: foo="bar"
Document doc = new Document();
FieldType ft = new FieldType(TextField.TYPE_UNSTORED);
doc.add(newField("foo", "bar", ft));
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
IndexSearcher is = newSearcher(ir);
// run the same query once per similarity provider (simProviders is declared outside this method)
for (SimilarityProvider simProvider : simProviders) {
is.setSimilarityProvider(simProvider);
// s1 matches the indexed term; s2 ("baz") was never indexed, so it is the term that does not exist
SpanTermQuery s1 = new SpanTermQuery(new Term("foo", "bar"));
SpanTermQuery s2 = new SpanTermQuery(new Term("foo", "baz"));
Query query = new SpanOrQuery(s1, s2);
TopDocs td = is.search(query, 10);
// the single indexed document must still match through s1
assertEquals(1, td.totalHits);
float score = td.scoreDocs[0].score;
// a NaN score also fails this check, because any comparison with NaN is false
assertTrue(score >= 0.0f);
assertFalse("inf score for " + simProvider, Float.isInfinite(score));
}
is.close();
ir.close();
dir.close();
}
}

View File

@ -388,11 +388,11 @@ public class TestSimilarityBase extends LuceneTestCase {
/** Correctness test for the Dirichlet LM model. */
public void testLMDirichlet() throws IOException {
float p =
(FREQ + 2000.0f * TOTAL_TERM_FREQ / (NUMBER_OF_FIELD_TOKENS + 1.0f)) /
(FREQ + 2000.0f * (TOTAL_TERM_FREQ + 1) / (NUMBER_OF_FIELD_TOKENS + 1.0f)) /
(DOC_LEN + 2000.0f);
float a = 2000.0f / (DOC_LEN + 2000.0f);
float gold = (float)(
Math.log(p / (a * TOTAL_TERM_FREQ / (NUMBER_OF_FIELD_TOKENS + 1.0f))) +
Math.log(p / (a * (TOTAL_TERM_FREQ + 1) / (NUMBER_OF_FIELD_TOKENS + 1.0f))) +
Math.log(a));
correctnessTestCore(new LMDirichletSimilarity(), gold);
}
@ -400,9 +400,9 @@ public class TestSimilarityBase extends LuceneTestCase {
/** Correctness test for the Jelinek-Mercer LM model. */
public void testLMJelinekMercer() throws IOException {
float p = (1 - 0.1f) * FREQ / DOC_LEN +
0.1f * TOTAL_TERM_FREQ / (NUMBER_OF_FIELD_TOKENS + 1.0f);
0.1f * (TOTAL_TERM_FREQ + 1) / (NUMBER_OF_FIELD_TOKENS + 1.0f);
float gold = (float)(Math.log(
p / (0.1f * TOTAL_TERM_FREQ / (NUMBER_OF_FIELD_TOKENS + 1.0f))));
p / (0.1f * (TOTAL_TERM_FREQ + 1) / (NUMBER_OF_FIELD_TOKENS + 1.0f))));
correctnessTestCore(new LMJelinekMercerSimilarity(0.1f), gold);
}
@ -412,7 +412,7 @@ public class TestSimilarityBase extends LuceneTestCase {
*/
public void testLLForIB() throws IOException {
SimilarityBase sim = new IBSimilarity(new DistributionLL(), new LambdaDF(), new Normalization.NoNormalization());
correctnessTestCore(sim, 4.26267987704f);
correctnessTestCore(sim, 4.178574562072754f);
}
/**
@ -422,7 +422,7 @@ public class TestSimilarityBase extends LuceneTestCase {
public void testSPLForIB() throws IOException {
SimilarityBase sim =
new IBSimilarity(new DistributionSPL(), new LambdaTTF(), new Normalization.NoNormalization());
correctnessTestCore(sim, 2.24069910825f);
correctnessTestCore(sim, 2.2387237548828125f);
}
/** Correctness test for the PL2 DFR model. */
@ -432,11 +432,11 @@ public class TestSimilarityBase extends LuceneTestCase {
float tfn = (float)(FREQ * SimilarityBase.log2(
1 + AVG_FIELD_LENGTH / DOC_LEN)); // 8.1894750101
float l = 1.0f / (tfn + 1.0f); // 0.108820144666
float lambda = (1.0f * TOTAL_TERM_FREQ) / NUMBER_OF_DOCUMENTS; // 0.7
float lambda = (1.0f + TOTAL_TERM_FREQ) / (1f + NUMBER_OF_DOCUMENTS); // 0.7029703
float p = (float)(tfn * SimilarityBase.log2(tfn / lambda) +
(lambda + 1 / (12 * tfn) - tfn) * SimilarityBase.log2(Math.E) +
0.5 * SimilarityBase.log2(2 * Math.PI * tfn)); // 21.1113611585
float gold = l * p; // 2.29734137536
0.5 * SimilarityBase.log2(2 * Math.PI * tfn)); // 21.065619
float gold = l * p; // 2.2923636
correctnessTestCore(sim, gold);
}
@ -444,14 +444,14 @@ public class TestSimilarityBase extends LuceneTestCase {
public void testIneB2() throws IOException {
SimilarityBase sim = new DFRSimilarity(
new BasicModelIne(), new AfterEffectB(), new NormalizationH2());
correctnessTestCore(sim, 6.23455315685f);
correctnessTestCore(sim, 5.747603416442871f);
}
/** Correctness test for the GL1 DFR model. */
public void testGL1() throws IOException {
SimilarityBase sim = new DFRSimilarity(
new BasicModelG(), new AfterEffectL(), new NormalizationH1());
correctnessTestCore(sim, 1.6463143825531006f);
correctnessTestCore(sim, 1.6390540599822998f);
}
/** Correctness test for the BEB1 DFR model. */
@ -459,34 +459,34 @@ public class TestSimilarityBase extends LuceneTestCase {
SimilarityBase sim = new DFRSimilarity(
new BasicModelBE(), new AfterEffectB(), new NormalizationH1());
float tfn = FREQ * AVG_FIELD_LENGTH / DOC_LEN; // 8.75
float b = (TOTAL_TERM_FREQ + 1) / (DOC_FREQ * (tfn + 1)); // 0.728205128205
float f = TOTAL_TERM_FREQ + tfn;
float n = f + NUMBER_OF_DOCUMENTS;
float n1 = n + f - 1; // 256.5
float m1 = n + f - tfn - 2; // 246.75
float n2 = f; // 78.75
float m2 = f - tfn; // 70.0
float b = (TOTAL_TERM_FREQ + 1 + 1) / ((DOC_FREQ + 1) * (tfn + 1)); // 0.67132866
double f = TOTAL_TERM_FREQ + 1 + tfn;
double n = f + NUMBER_OF_DOCUMENTS;
double n1 = n + f - 1; // 258.5
double m1 = n + f - tfn - 2; // 248.75
double n2 = f; // 79.75
double m2 = f - tfn; // 71.0
float be = (float)(-SimilarityBase.log2(n - 1) -
SimilarityBase.log2(Math.E) + // -8.916400790508378
SimilarityBase.log2(Math.E) + // -8.924494472554715
((m1 + 0.5f) * SimilarityBase.log2(n1 / m1) +
(n1 - m1) * SimilarityBase.log2(n1)) - // 91.85089272283668
(n1 - m1) * SimilarityBase.log2(n1)) - // 91.9620374903885
((m2 + 0.5f) * SimilarityBase.log2(n2 / m2) +
(n2 - m2) * SimilarityBase.log2(n2))); // 67.09778276257171
// 15.836709
float gold = b * be; // 11.532373
(n2 - m2) * SimilarityBase.log2(n2))); // 67.26544321004599
// 15.7720995
float gold = b * be; // 10.588263
correctnessTestCore(sim, gold);
}
/** Correctness test for the D DFR model (basic model only). */
public void testD() throws IOException {
SimilarityBase sim = new DFRSimilarity(new BasicModelD(), new AfterEffect.NoAfterEffect(), new Normalization.NoNormalization());
double totalTermFreqNorm = TOTAL_TERM_FREQ + FREQ;
double p = 1.0 / (NUMBER_OF_DOCUMENTS + 1); // 0.009900990099
double phi = FREQ / totalTermFreqNorm; // 0.09090909090909091
double D = phi * SimilarityBase.log2(phi / p) + // 0.17884523239871358
double totalTermFreqNorm = TOTAL_TERM_FREQ + FREQ + 1;
double p = 1.0 / (NUMBER_OF_DOCUMENTS + 1); // 0.009900990099009901
double phi = FREQ / totalTermFreqNorm; // 0.08974358974358974
double D = phi * SimilarityBase.log2(phi / p) + // 0.17498542370019005
(1 - phi) * SimilarityBase.log2((1 - phi) / (1 - p));
float gold = (float)(totalTermFreqNorm * D + 0.5 * SimilarityBase.log2(
1 + 2 * Math.PI * FREQ * (1 - phi))); // 16.449575
1 + 2 * Math.PI * FREQ * (1 - phi))); // 16.328257
correctnessTestCore(sim, gold);
}
@ -505,7 +505,7 @@ public class TestSimilarityBase extends LuceneTestCase {
public void testIFB() throws IOException {
SimilarityBase sim = new DFRSimilarity(
new BasicModelIF(), new AfterEffectB(), new Normalization.NoNormalization());
float B = (TOTAL_TERM_FREQ + 1) / (DOC_FREQ * (FREQ + 1)); // 0.8875
float B = (TOTAL_TERM_FREQ + 1 + 1) / ((DOC_FREQ + 1) * (FREQ + 1)); // 0.818181818182
float IF = (float)(FREQ * SimilarityBase.log2( // 8.97759389642
1 + (NUMBER_OF_DOCUMENTS + 1) / (TOTAL_TERM_FREQ + 0.5)));
float gold = B * IF; // 7.34530409707