LUCENE-6896: don't treat smallest possible norm value as an infinitely long doc in SimilarityBase or BM25Similarity

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1725178 13f79535-47bb-0310-9956-ffa450edef68
2016-01-18 07:48:24 +00:00 · 2016-01-18 07:48:24 +00:00 · 9dc0ba4c7b
parent 40d290ee84
commit 9dc0ba4c7b
10 changed files with 127 additions and 8 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -184,6 +184,10 @@ Bug Fixes
  EOFException if you seek past the end of the file and then try to
  read (Stéphane Campinas via Mike McCandless)
 * LUCENE-6896: Don't treat the smallest possible norm value as an infinitely
  long document in SimilarityBase or BM25Similarity. Add more warnings to sims
  that will not work well with extreme tf values. (Ahmet Arslan, Robert Muir)
 Other
 * LUCENE-6924: Upgrade randomizedtesting to 2.3.2. (Dawid Weiss)
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
@ -128,10 +128,11 @@ public class BM25Similarity extends Similarity {
  private static final float[] NORM_TABLE = new float[256];
  static {
-    for (int i = 0; i < 256; i++) {
+    for (int i = 1; i < 256; i++) {
      float f = SmallFloat.byte315ToFloat((byte)i);
      NORM_TABLE[i] = 1.0f / (f*f);
    }
    NORM_TABLE[0] = 1.0f / NORM_TABLE[255]; // otherwise inf
  }
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelBE.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelBE.java
@ -24,10 +24,10 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2;
 * slightly from the one in the original paper: {@code F} is increased by {@code tfn+1}
 * and {@code N} is increased by {@code F} 
 * @lucene.experimental
- * NOTE: in some corner cases this model may give poor performance with Normalizations that
+ * NOTE: in some corner cases this model may give poor performance or infinite scores with 
- * return large values for {@code tfn} such as NormalizationH3. Consider using the 
+ * Normalizations that return large or small values for {@code tfn} such as NormalizationH3. 
- * geometric approximation ({@link BasicModelG}) instead, which provides the same relevance
+ * Consider using the geometric approximation ({@link BasicModelG}) instead, which provides 
- * but with less practical problems. 
+ * the same relevance but with less practical problems. 
 */
 public class BasicModelBE extends BasicModel {
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelD.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelD.java
@ -28,7 +28,7 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2;
 * <p>
 * WARNING: for terms that do not meet the expected random distribution
 * (e.g. stopwords), this model may give poor performance, such as
- * abnormally high scores for low tf values.
+ * abnormally high or NaN scores for low tf values.
 * @lucene.experimental
 */
 public class BasicModelD extends BasicModel {
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/DistributionSPL.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/DistributionSPL.java
@ -23,6 +23,8 @@ package org.apache.lucene.search.similarities;
 * <p>Unlike for DFR, the natural logarithm is used, as
 * it is faster to compute and the original paper does not express any
 * preference to a specific base.</p>
 * WARNING: this model currently returns infinite scores for very small
 * tf values and negative scores for very large tf values
 * @lucene.experimental
 */
 public class DistributionSPL extends Distribution {
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java
@ -220,10 +220,11 @@ public abstract class SimilarityBase extends Similarity {
  private static final float[] NORM_TABLE = new float[256];
  static {
-    for (int i = 0; i < 256; i++) {
+    for (int i = 1; i < 256; i++) {
      float floatNorm = SmallFloat.byte315ToFloat((byte)i);
      NORM_TABLE[i] = 1.0f / (floatNorm * floatNorm);
    }
    NORM_TABLE[0] = 1.0f / NORM_TABLE[255]; // otherwise inf
  }
  /** Encodes the document length in the same way as {@link TFIDFSimilarity}. */
--- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBM25Similarity.java
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBM25Similarity.java
@ -0,0 +1,36 @@
 package org.apache.lucene.search.similarities;
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import org.apache.lucene.util.LuceneTestCase;
 /** Sanity checks for the norm decoding performed by {@link BM25Similarity}. */
 public class TestBM25Similarity extends LuceneTestCase {
  /**
   * Decodes every norm byte (the ints 0 through 255, each cast to {@code byte}) and checks
   * that the decoded length is not negative, not infinite and not NaN, and that it is
   * strictly smaller than the length decoded for the preceding value.
   */
  public void testSaneNormValues() {
    final BM25Similarity similarity = new BM25Similarity();
    int norm = 0;
    while (norm < 256) {
      final float len = similarity.decodeNormValue((byte) norm);
      assertFalse("negative len: " + len + ", byte=" + norm, len < 0.0f);
      assertFalse("inf len: " + len + ", byte=" + norm, Float.isInfinite(len));
      assertFalse("nan len for byte=" + norm, Float.isNaN(len));
      if (norm != 0) {
        // compare against the length decoded for the previous norm value
        final float previousLen = similarity.decodeNormValue((byte) (norm - 1));
        assertTrue("len is not decreasing: " + len + ",byte=" + norm, len < previousLen);
      }
      norm++;
    }
  }
 }
--- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestClassicSimilarity.java
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestClassicSimilarity.java
@ -158,4 +158,17 @@ public class TestClassicSimilarity extends LuceneTestCase {
    assertEquals(1, topDocs.scoreDocs.length);
    assertTrue(topDocs.scoreDocs[0].score != 0);
  }
  /**
   * Decodes every norm byte (the ints 0 through 255, each cast to {@code byte}) with a
   * {@link ClassicSimilarity} and checks that the decoded boost is not negative, not
   * infinite and not NaN, and that it is strictly greater than the boost decoded for the
   * preceding value.
   */
  public void testSaneNormValues() {
    ClassicSimilarity sim = new ClassicSimilarity();
    for (int i = 0; i < 256; i++) {
      float boost = sim.decodeNormValue((byte) i);
      assertFalse("negative boost: " + boost + ", byte=" + i, boost < 0.0f);
      assertFalse("inf boost: " + boost + ", byte=" + i, Float.isInfinite(boost));
      assertFalse("nan boost for byte=" + i, Float.isNaN(boost));
      if (i > 0) {
        // each decoded boost must exceed the one decoded for the previous norm value
        assertTrue("boost is not increasing: " + boost + ",byte=" + i, boost > sim.decodeNormValue((byte)(i-1)));
      }
    }
  }
 }
--- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java
@ -286,8 +286,9 @@ public class TestSimilarity2 extends LuceneTestCase {
      TopDocs td = is.search(query, 10);
      assertEquals(1, td.totalHits);
      float score = td.scoreDocs[0].score;
-      assertTrue(score >= 0.0f);
+      assertFalse("negative score for " + sim, score < 0.0f);
      assertFalse("inf score for " + sim, Float.isInfinite(score));
      assertFalse("nan score for " + sim, Float.isNaN(score));
    }
    ir.close();
    dir.close();
--- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java
@ -592,4 +592,65 @@ public class TestSimilarityBase extends LuceneTestCase {
    actual.setDiscountOverlaps(true);
    assertEquals(expected.computeNorm(state), actual.computeNorm(state));
  }
  /**
   * For every similarity in {@code sims}, decodes each norm byte (the ints 0 through 255,
   * each cast to {@code byte}) and checks that the decoded length is not negative, not
   * infinite and not NaN, and that it is strictly smaller than the length decoded for the
   * preceding value.
   */
  public void testSaneNormValues() {
    for (SimilarityBase similarity : sims) {
      int norm = 0;
      while (norm < 256) {
        final float len = similarity.decodeNormValue((byte) norm);
        assertFalse("negative len: " + len + ", byte=" + norm + ", sim=" + similarity, len < 0.0f);
        assertFalse("inf len: " + len + ", byte=" + norm + ", sim=" + similarity, Float.isInfinite(len));
        assertFalse("nan len for byte=" + norm + ", sim=" + similarity, Float.isNaN(len));
        if (norm != 0) {
          // compare against the length decoded for the previous norm value
          final float previousLen = similarity.decodeNormValue((byte) (norm - 1));
          assertTrue("len is not decreasing: " + len + ",byte=" + norm + ",sim=" + similarity, len < previousLen);
        }
        norm++;
      }
    }
  }
  /**
   * Scores each similarity in {@code sims} for term frequencies 1 through 10 against the
   * lengths decoded from all 256 norm values, and checks that no score is negative,
   * infinite or NaN. Similarities built on models with known problems are skipped (see the
   * TODOs below).
   */
  public void testCrazyIndexTimeBoosts() throws Exception {
    final long avgLength = 750;
    final long docCount = 500000;
    final long numTokens = docCount * avgLength;
    final CollectionStatistics collectionStats =
        new CollectionStatistics("body", docCount, docCount, numTokens, numTokens);

    final long docFreq = 2000;
    final long totalTermFreq = 2000 * avgLength;
    final TermStatistics termStats =
        new TermStatistics(new BytesRef("term"), docFreq, totalTermFreq);

    for (SimilarityBase sim : sims) {
      if (sim instanceof IBSimilarity) {
        final boolean usesSpl = ((IBSimilarity) sim).getDistribution() instanceof DistributionSPL;
        if (usesSpl) {
          // score goes infinite for tiny doc lengths and negative for huge doc lengths
          // TODO: fix this
          continue;
        }
      } else if (sim instanceof DFRSimilarity) {
        final BasicModel model = ((DFRSimilarity) sim).getBasicModel();
        // D and P: score goes NaN for tiny doc lengths
        // BE: score goes negative infinity for tiny doc lengths
        // TODO: fix this
        final boolean knownBad = model instanceof BasicModelD
            || model instanceof BasicModelP
            || model instanceof BasicModelBE;
        if (knownBad) {
          continue;
        }
      }

      final BasicStats stats = (BasicStats) sim.computeWeight(collectionStats, termStats);
      for (int freq = 1; freq <= 10; freq++) {
        final float tf = freq; // 1.0f, 2.0f, ... 10.0f
        for (int norm = 0; norm < 256; norm++) {
          final float len = sim.decodeNormValue((byte) norm);
          final float score = sim.score(stats, tf, len);
          assertFalse("negative score for " + sim + ", len=" + len + ",score=" + score, score < 0.0f);
          assertFalse("inf score for " + sim + ", len=" + len, Float.isInfinite(score));
          assertFalse("nan score for " + sim + ", len=" + len, Float.isNaN(score));
        }
      }
    }
  }
 }