LUCENE-3430: fix some sims with SpanQueries (which sometimes score terms with df=0/ttf=0/...)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1169589 13f79535-47bb-0310-9956-ffa450edef68
2011-09-12 01:13:23 +00:00 · 2011-09-12 01:13:23 +00:00 · 156a5c89ee
parent a027a35583
commit 156a5c89ee
11 changed files with 82 additions and 44 deletions
--- a/lucene/src/java/org/apache/lucene/search/similarities/AfterEffectB.java
+++ b/lucene/src/java/org/apache/lucene/search/similarities/AfterEffectB.java
@ -26,8 +26,8 @@ import org.apache.lucene.search.Explanation;
 public class AfterEffectB extends AfterEffect {
  @Override
  public final float score(BasicStats stats, float tfn) {
-    long F = stats.getTotalTermFreq();
-    int n = stats.getDocFreq();
+    long F = stats.getTotalTermFreq()+1;
+    int n = stats.getDocFreq()+1;
    return (F + 1) / (n * (tfn + 1));
  }
  
--- a/lucene/src/java/org/apache/lucene/search/similarities/BasicModelBE.java
+++ b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelBE.java
@ -21,14 +21,14 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2;

 /**
 * Limiting form of the Bose-Einstein model. The formula used in Lucene differs
- * slightly from the one in the original paper: {@code F} is increased by {@code tfn}
+ * slightly from the one in the original paper: {@code F} is increased by {@code tfn+1}
 * and {@code N} is increased by {@code F} 
 * @lucene.experimental
 */
 public class BasicModelBE extends BasicModel {
  @Override
  public final float score(BasicStats stats, float tfn) {
-    double F = stats.getTotalTermFreq() + tfn;
+    double F = stats.getTotalTermFreq() + 1 + tfn;
    // approximation only holds true when F << N, so we use N += F
    double N = F + stats.getNumberOfDocuments();
    return (float)(-log2((N - 1) * Math.E)
--- a/lucene/src/java/org/apache/lucene/search/similarities/BasicModelD.java
+++ b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelD.java
@ -24,7 +24,7 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2;
 * for DFR. The formula used in Lucene differs slightly from the one in the
 * original paper: to avoid underflow for small values of {@code N} and
 * {@code F}, {@code N} is increased by {@code 1} and
- * {@code F} is always increased by {@code tfn}.
+ * {@code F} is always increased by {@code tfn+1}.
 * <p>
 * WARNING: for terms that do not meet the expected random distribution
 * (e.g. stopwords), this model may give poor performance, such as
@ -37,7 +37,7 @@ public class BasicModelD extends BasicModel {
    // we have to ensure phi is always < 1 for tiny TTF values, otherwise nphi can go negative,
    // resulting in NaN. cleanest way is to unconditionally always add tfn to totalTermFreq
    // to create a 'normalized' F.
-    double F = stats.getTotalTermFreq() + tfn;
+    double F = stats.getTotalTermFreq() + 1 + tfn;
    double phi = (double)tfn / F;
    double nphi = 1 - phi;
    double p = 1.0 / (stats.getNumberOfDocuments() + 1);
--- a/lucene/src/java/org/apache/lucene/search/similarities/BasicModelG.java
+++ b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelG.java
@ -21,7 +21,7 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2;

 /**
 * Geometric as limiting form of the Bose-Einstein model.  The formula used in Lucene differs
- * slightly from the one in the original paper: {@code F} is increased by {@code tfn}
+ * slightly from the one in the original paper: {@code F} is increased by {@code 1}
 * and {@code N} is increased by {@code F}.
 * @lucene.experimental
 */
@ -29,7 +29,9 @@ public class BasicModelG extends BasicModel {
  @Override
  public final float score(BasicStats stats, float tfn) {
    // just like in BE, approximation only holds true when F << N, so we use lambda = F / (N + F)
-    double lambda = stats.getTotalTermFreq() / (double) (stats.getNumberOfDocuments() + stats.getTotalTermFreq());
+    double F = stats.getTotalTermFreq() + 1;
+    double N = stats.getNumberOfDocuments();
+    double lambda = F / (N + F);
    // -log(1 / (lambda + 1)) -> log(lambda + 1)
    return (float)(log2(lambda + 1) + tfn * log2((1 + lambda) / lambda));
  }
--- a/lucene/src/java/org/apache/lucene/search/similarities/BasicModelP.java
+++ b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelP.java
@ -33,7 +33,7 @@ public class BasicModelP extends BasicModel {
  
  @Override
  public final float score(BasicStats stats, float tfn) {
-    float lambda = (float)stats.getTotalTermFreq() / stats.getNumberOfDocuments();
+    float lambda = (float)(stats.getTotalTermFreq()+1) / (stats.getNumberOfDocuments()+1);
    return (float)(tfn * log2(tfn / lambda)
        + (lambda + 1 / (12 * tfn) - tfn) * LOG2_E
        + 0.5 * log2(2 * Math.PI * tfn));
--- a/lucene/src/java/org/apache/lucene/search/similarities/LMSimilarity.java
+++ b/lucene/src/java/org/apache/lucene/search/similarities/LMSimilarity.java
@ -144,7 +144,7 @@ public abstract class LMSimilarity extends SimilarityBase {
  public static class DefaultCollectionModel implements CollectionModel {
    @Override
    public float computeProbability(BasicStats stats) {
-      return (float)stats.getTotalTermFreq() / (stats.getNumberOfFieldTokens() +1);
+      return (stats.getTotalTermFreq()+1F) / (stats.getNumberOfFieldTokens()+1F);
    }
    
    @Override
--- a/lucene/src/java/org/apache/lucene/search/similarities/LambdaDF.java
+++ b/lucene/src/java/org/apache/lucene/search/similarities/LambdaDF.java
@ -20,13 +20,13 @@ package org.apache.lucene.search.similarities;
 import org.apache.lucene.search.Explanation;

 /**
- * Computes lambda as {@code totalTermFreq / numberOfDocuments}.
+ * Computes lambda as {@code docFreq+1 / numberOfDocuments+1}.
 * @lucene.experimental
 */
 public class LambdaDF extends Lambda {
  @Override
  public final float lambda(BasicStats stats) {
-    return (float)stats.getDocFreq() / stats.getNumberOfDocuments();
+    return (stats.getDocFreq()+1F) / (stats.getNumberOfDocuments()+1F);
  }
  
  @Override
--- a/lucene/src/java/org/apache/lucene/search/similarities/LambdaTTF.java
+++ b/lucene/src/java/org/apache/lucene/search/similarities/LambdaTTF.java
@ -20,13 +20,13 @@ package org.apache.lucene.search.similarities;
 import org.apache.lucene.search.Explanation;

 /**
- * Computes lambda as {@code docFreq / numberOfDocuments}.
+ * Computes lambda as {@code totalTermFreq+1 / numberOfDocuments+1}.
 * @lucene.experimental
 */
 public class LambdaTTF extends Lambda {  
  @Override
  public final float lambda(BasicStats stats) {
-    return (float)stats.getTotalTermFreq() / stats.getNumberOfDocuments();
+    return (stats.getTotalTermFreq()+1F) / (stats.getNumberOfDocuments()+1F);
  }

  @Override
--- a/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH3.java
+++ b/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH3.java
@ -34,7 +34,7 @@ public class NormalizationH3 extends Normalization {

  @Override
  public float tfn(BasicStats stats, float tf, float len) {
-    return (tf + mu * (stats.getTotalTermFreq() / (float)stats.getNumberOfFieldTokens())) / (len + mu) * mu;
+    return (tf + mu * ((stats.getTotalTermFreq()+1F) / (stats.getNumberOfFieldTokens()+1F))) / (len + mu) * mu;
  }

  @Override
--- a/lucene/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java
+++ b/lucene/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java
@ -36,8 +36,12 @@ import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.Collector;
 import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
 import org.apache.lucene.search.Scorer;
 import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.spans.SpanOrQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.LuceneTestCase;

@ -215,4 +219,36 @@ public class TestSimilarity2 extends LuceneTestCase {
    ir.close();
    dir.close();
  }
+  
+  /** make sure all sims work with spanOR(termX, termY) where termY does not exist */
+  public void testCrazySpans() throws Exception {
+    // The problem: "normal" lucene queries create scorers, returning null if terms dont exist
+    // This means they never score a term that does not exist.
+    // however with spans, there is only one scorer for the whole hierarchy:
+    // inner queries are not real queries, their boosts are ignored, etc.
+    Directory dir = newDirectory();
+    RandomIndexWriter iw = new RandomIndexWriter(random, dir);
+    Document doc = new Document();
+    FieldType ft = new FieldType(TextField.TYPE_UNSTORED);
+    doc.add(newField("foo", "bar", ft));
+    iw.addDocument(doc);
+    IndexReader ir = iw.getReader();
+    iw.close();
+    IndexSearcher is = newSearcher(ir);
+    
+    for (SimilarityProvider simProvider : simProviders) {
+      is.setSimilarityProvider(simProvider);
+      SpanTermQuery s1 = new SpanTermQuery(new Term("foo", "bar"));
+      SpanTermQuery s2 = new SpanTermQuery(new Term("foo", "baz"));
+      Query query = new SpanOrQuery(s1, s2);
+      TopDocs td = is.search(query, 10);
+      assertEquals(1, td.totalHits);
+      float score = td.scoreDocs[0].score;
+      assertTrue(score >= 0.0f);
+      assertFalse("inf score for " + simProvider, Float.isInfinite(score));
+    }
+    is.close();
+    ir.close();
+    dir.close();
+  }
 }
--- a/lucene/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java
+++ b/lucene/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java
@ -388,11 +388,11 @@ public class TestSimilarityBase extends LuceneTestCase {
  /** Correctness test for the Dirichlet LM model. */
  public void testLMDirichlet() throws IOException {
    float p =
-        (FREQ + 2000.0f * TOTAL_TERM_FREQ / (NUMBER_OF_FIELD_TOKENS + 1.0f)) /
+        (FREQ + 2000.0f * (TOTAL_TERM_FREQ + 1) / (NUMBER_OF_FIELD_TOKENS + 1.0f)) /
        (DOC_LEN + 2000.0f);
    float a = 2000.0f / (DOC_LEN + 2000.0f);
    float gold = (float)(
-        Math.log(p / (a * TOTAL_TERM_FREQ / (NUMBER_OF_FIELD_TOKENS + 1.0f))) +
+        Math.log(p / (a * (TOTAL_TERM_FREQ + 1) / (NUMBER_OF_FIELD_TOKENS + 1.0f))) +
        Math.log(a));
    correctnessTestCore(new LMDirichletSimilarity(), gold);
  }
@ -400,9 +400,9 @@ public class TestSimilarityBase extends LuceneTestCase {
  /** Correctness test for the Jelinek-Mercer LM model. */
  public void testLMJelinekMercer() throws IOException {
    float p = (1 - 0.1f) * FREQ / DOC_LEN +
-              0.1f * TOTAL_TERM_FREQ / (NUMBER_OF_FIELD_TOKENS + 1.0f);
+              0.1f * (TOTAL_TERM_FREQ + 1) / (NUMBER_OF_FIELD_TOKENS + 1.0f);
    float gold = (float)(Math.log(
-        p / (0.1f * TOTAL_TERM_FREQ / (NUMBER_OF_FIELD_TOKENS + 1.0f))));
+        p / (0.1f * (TOTAL_TERM_FREQ + 1) / (NUMBER_OF_FIELD_TOKENS + 1.0f))));
    correctnessTestCore(new LMJelinekMercerSimilarity(0.1f), gold);
  }
  
@ -412,7 +412,7 @@ public class TestSimilarityBase extends LuceneTestCase {
   */
  public void testLLForIB() throws IOException {
    SimilarityBase sim = new IBSimilarity(new DistributionLL(), new LambdaDF(), new Normalization.NoNormalization());
-    correctnessTestCore(sim, 4.26267987704f);
+    correctnessTestCore(sim, 4.178574562072754f);
  }
  
  /**
@ -422,7 +422,7 @@ public class TestSimilarityBase extends LuceneTestCase {
  public void testSPLForIB() throws IOException {
    SimilarityBase sim =
      new IBSimilarity(new DistributionSPL(), new LambdaTTF(), new Normalization.NoNormalization());
-    correctnessTestCore(sim, 2.24069910825f);
+    correctnessTestCore(sim, 2.2387237548828125f);
  }
  
  /** Correctness test for the PL2 DFR model. */
@ -432,11 +432,11 @@ public class TestSimilarityBase extends LuceneTestCase {
    float tfn = (float)(FREQ * SimilarityBase.log2(
        1 + AVG_FIELD_LENGTH / DOC_LEN));  // 8.1894750101
    float l = 1.0f / (tfn + 1.0f);         // 0.108820144666
-    float lambda = (1.0f * TOTAL_TERM_FREQ) / NUMBER_OF_DOCUMENTS;  // 0.7
+    float lambda = (1.0f + TOTAL_TERM_FREQ) / (1f + NUMBER_OF_DOCUMENTS);  // 0.7029703
    float p = (float)(tfn * SimilarityBase.log2(tfn / lambda) +
              (lambda + 1 / (12 * tfn) - tfn) * SimilarityBase.log2(Math.E) +
-              0.5 * SimilarityBase.log2(2 * Math.PI * tfn)); // 21.1113611585
-    float gold = l * p;                    // 2.29734137536
+              0.5 * SimilarityBase.log2(2 * Math.PI * tfn)); // 21.065619
+    float gold = l * p;                    // 2.2923636
    correctnessTestCore(sim, gold);
  }

@ -444,14 +444,14 @@ public class TestSimilarityBase extends LuceneTestCase {
  public void testIneB2() throws IOException {
    SimilarityBase sim = new DFRSimilarity(
        new BasicModelIne(), new AfterEffectB(), new NormalizationH2());
-    correctnessTestCore(sim, 6.23455315685f);
+    correctnessTestCore(sim, 5.747603416442871f);
  }
  
  /** Correctness test for the GL1 DFR model. */
  public void testGL1() throws IOException {
    SimilarityBase sim = new DFRSimilarity(
        new BasicModelG(), new AfterEffectL(), new NormalizationH1());
-    correctnessTestCore(sim, 1.6463143825531006f);
+    correctnessTestCore(sim, 1.6390540599822998f);
  }
  
  /** Correctness test for the BEB1 DFR model. */
@ -459,34 +459,34 @@ public class TestSimilarityBase extends LuceneTestCase {
    SimilarityBase sim = new DFRSimilarity(
        new BasicModelBE(), new AfterEffectB(), new NormalizationH1());
    float tfn = FREQ * AVG_FIELD_LENGTH / DOC_LEN;  // 8.75
-    float b = (TOTAL_TERM_FREQ + 1) / (DOC_FREQ * (tfn + 1));  // 0.728205128205
-    float f = TOTAL_TERM_FREQ + tfn;
-    float n = f + NUMBER_OF_DOCUMENTS;
-    float n1 = n + f - 1;        // 256.5
-    float m1 = n + f - tfn - 2;  // 246.75
-    float n2 = f;                                      // 78.75
-    float m2 = f - tfn;                                // 70.0
+    float b = (TOTAL_TERM_FREQ + 1 + 1) / ((DOC_FREQ + 1) * (tfn + 1));  // 0.67132866
+    double f = TOTAL_TERM_FREQ + 1 + tfn;
+    double n = f + NUMBER_OF_DOCUMENTS;
+    double n1 = n + f - 1;        // 258.5
+    double m1 = n + f - tfn - 2;  // 248.75
+    double n2 = f;                                      // 79.75
+    double m2 = f - tfn;                                // 71.0
    float be = (float)(-SimilarityBase.log2(n - 1) -
-               SimilarityBase.log2(Math.E) +                   // -8.916400790508378
+               SimilarityBase.log2(Math.E) +                   // -8.924494472554715
               ((m1 + 0.5f) * SimilarityBase.log2(n1 / m1) +
-                (n1 - m1) * SimilarityBase.log2(n1)) -         // 91.85089272283668
+                (n1 - m1) * SimilarityBase.log2(n1)) -         // 91.9620374903885
               ((m2 + 0.5f) * SimilarityBase.log2(n2 / m2) +
-                (n2 - m2) * SimilarityBase.log2(n2)));         // 67.09778276257171
-               // 15.836709
-    float gold = b * be;                                       // 11.532373
+                (n2 - m2) * SimilarityBase.log2(n2)));         // 67.26544321004599
+               // 15.7720995
+    float gold = b * be;                                       // 10.588263
    correctnessTestCore(sim, gold);
  }

  /** Correctness test for the D DFR model (basic model only). */
  public void testD() throws IOException {
    SimilarityBase sim = new DFRSimilarity(new BasicModelD(), new AfterEffect.NoAfterEffect(), new Normalization.NoNormalization());
-    double totalTermFreqNorm = TOTAL_TERM_FREQ + FREQ;
-    double p = 1.0 / (NUMBER_OF_DOCUMENTS + 1);                // 0.009900990099
-    double phi = FREQ / totalTermFreqNorm;                       // 0.09090909090909091
-    double D = phi * SimilarityBase.log2(phi / p) +            // 0.17884523239871358
+    double totalTermFreqNorm = TOTAL_TERM_FREQ + FREQ + 1;
+    double p = 1.0 / (NUMBER_OF_DOCUMENTS + 1);                // 0.009900990099009901
+    double phi = FREQ / totalTermFreqNorm;                       // 0.08974358974358974
+    double D = phi * SimilarityBase.log2(phi / p) +            // 0.17498542370019005
              (1 - phi) * SimilarityBase.log2((1 - phi) / (1 - p));
    float gold = (float)(totalTermFreqNorm * D + 0.5 * SimilarityBase.log2(
-                 1 + 2 * Math.PI * FREQ * (1 - phi)));         // 16.449575
+                 1 + 2 * Math.PI * FREQ * (1 - phi)));         // 16.328257
    correctnessTestCore(sim, gold);
  }
  
@ -505,7 +505,7 @@ public class TestSimilarityBase extends LuceneTestCase {
  public void testIFB() throws IOException {
    SimilarityBase sim = new DFRSimilarity(
        new BasicModelIF(), new AfterEffectB(), new Normalization.NoNormalization());
-    float B = (TOTAL_TERM_FREQ + 1) / (DOC_FREQ * (FREQ + 1)); // 0.8875
+    float B = (TOTAL_TERM_FREQ + 1 + 1) / ((DOC_FREQ + 1) * (FREQ + 1)); // 0.8875
    float IF = (float)(FREQ * SimilarityBase.log2(             // 8.97759389642
               1 + (NUMBER_OF_DOCUMENTS + 1) / (TOTAL_TERM_FREQ + 0.5)));
    float gold = B * IF;                                       // 7.96761458307