LUCENE-6896: don't treat smallest possible norm value as an infinitely long doc in SimilarityBase or BM25Similarity

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1725178 13f79535-47bb-0310-9956-ffa450edef68
2016-01-18 07:48:24 +00:00 · 2016-01-18 07:48:24 +00:00 · 9dc0ba4c7b
parent 40d290ee84
commit 9dc0ba4c7b
10 changed files with 127 additions and 8 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -184,6 +184,10 @@ Bug Fixes
  EOFException if you seek past the end of the file and then try to
  read (Stéphane Campinas via Mike McCandless)

+* LUCENE-6896: Don't treat the smallest possible norm value as an infinitely
+  long document in SimilarityBase or BM25Similarity. Add more warnings to sims
+  that will not work well with extreme tf values. (Ahmet Arslan, Robert Muir)
+
 Other

 * LUCENE-6924: Upgrade randomizedtesting to 2.3.2. (Dawid Weiss)
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
@ -128,10 +128,11 @@ public class BM25Similarity extends Similarity {
  private static final float[] NORM_TABLE = new float[256];

  static {
-    for (int i = 0; i < 256; i++) {
+    for (int i = 1; i < 256; i++) {
      float f = SmallFloat.byte315ToFloat((byte)i);
      NORM_TABLE[i] = 1.0f / (f*f);
    }
+    NORM_TABLE[0] = 1.0f / NORM_TABLE[255]; // otherwise inf
  }


--- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelBE.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelBE.java
@ -24,10 +24,10 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2;
 * slightly from the one in the original paper: {@code F} is increased by {@code tfn+1}
 * and {@code N} is increased by {@code F} 
 * @lucene.experimental
- * NOTE: in some corner cases this model may give poor performance with Normalizations that
- * return large values for {@code tfn} such as NormalizationH3. Consider using the 
- * geometric approximation ({@link BasicModelG}) instead, which provides the same relevance
- * but with less practical problems. 
+ * NOTE: in some corner cases this model may give poor performance or infinite scores with 
+ * Normalizations that return large or small values for {@code tfn} such as NormalizationH3. 
+ * Consider using the geometric approximation ({@link BasicModelG}) instead, which provides 
+ * the same relevance but with less practical problems. 
 */
 public class BasicModelBE extends BasicModel {
  
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelD.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelD.java
@ -28,7 +28,7 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2;
 * <p>
 * WARNING: for terms that do not meet the expected random distribution
 * (e.g. stopwords), this model may give poor performance, such as
- * abnormally high scores for low tf values.
+ * abnormally high or NaN scores for low tf values.
 * @lucene.experimental
 */
 public class BasicModelD extends BasicModel {
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/DistributionSPL.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/DistributionSPL.java
@ -23,6 +23,8 @@ package org.apache.lucene.search.similarities;
 * <p>Unlike for DFR, the natural logarithm is used, as
 * it is faster to compute and the original paper does not express any
 * preference to a specific base.</p>
+ * WARNING: this model currently returns infinite scores for very small
+ * tf values and negative scores for very large tf values
 * @lucene.experimental
 */
 public class DistributionSPL extends Distribution {
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java
@ -220,10 +220,11 @@ public abstract class SimilarityBase extends Similarity {
  private static final float[] NORM_TABLE = new float[256];

  static {
-    for (int i = 0; i < 256; i++) {
+    for (int i = 1; i < 256; i++) {
      float floatNorm = SmallFloat.byte315ToFloat((byte)i);
      NORM_TABLE[i] = 1.0f / (floatNorm * floatNorm);
    }
+    NORM_TABLE[0] = 1.0f / NORM_TABLE[255]; // otherwise inf
  }

  /** Encodes the document length in the same way as {@link TFIDFSimilarity}. */
--- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBM25Similarity.java
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBM25Similarity.java
@ -0,0 +1,36 @@
+package org.apache.lucene.search.similarities;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestBM25Similarity extends LuceneTestCase {
+  
+  public void testSaneNormValues() {
+    BM25Similarity sim = new BM25Similarity();
+    for (int i = 0; i < 256; i++) {
+      float len = sim.decodeNormValue((byte) i);
+      assertFalse("negative len: " + len + ", byte=" + i, len < 0.0f);
+      assertFalse("inf len: " + len + ", byte=" + i, Float.isInfinite(len));
+      assertFalse("nan len for byte=" + i, Float.isNaN(len));
+      if (i > 0) {
+        assertTrue("len is not decreasing: " + len + ",byte=" + i, len < sim.decodeNormValue((byte)(i-1)));
+      }
+    }
+  }
+}
--- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestClassicSimilarity.java
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestClassicSimilarity.java
@ -158,4 +158,17 @@ public class TestClassicSimilarity extends LuceneTestCase {
    assertEquals(1, topDocs.scoreDocs.length);
    assertTrue(topDocs.scoreDocs[0].score != 0);
  }
+  
+  public void testSaneNormValues() {
+    ClassicSimilarity sim = new ClassicSimilarity();
+    for (int i = 0; i < 256; i++) {
+      float boost = sim.decodeNormValue((byte) i);
+      assertFalse("negative boost: " + boost + ", byte=" + i, boost < 0.0f);
+      assertFalse("inf bost: " + boost + ", byte=" + i, Float.isInfinite(boost));
+      assertFalse("nan boost for byte=" + i, Float.isNaN(boost));
+      if (i > 0) {
+        assertTrue("boost is not increasing: " + boost + ",byte=" + i, boost > sim.decodeNormValue((byte)(i-1)));
+      }
+    }
+  }
 }
--- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java
@ -286,8 +286,9 @@ public class TestSimilarity2 extends LuceneTestCase {
      TopDocs td = is.search(query, 10);
      assertEquals(1, td.totalHits);
      float score = td.scoreDocs[0].score;
-      assertTrue(score >= 0.0f);
+      assertFalse("negative score for " + sim, score < 0.0f);
      assertFalse("inf score for " + sim, Float.isInfinite(score));
+      assertFalse("nan score for " + sim, Float.isNaN(score));
    }
    ir.close();
    dir.close();
--- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java
@ -592,4 +592,65 @@ public class TestSimilarityBase extends LuceneTestCase {
    actual.setDiscountOverlaps(true);
    assertEquals(expected.computeNorm(state), actual.computeNorm(state));
  }
+  
+  public void testSaneNormValues() {
+    for (SimilarityBase sim : sims) {
+      for (int i = 0; i < 256; i++) {
+        float len = sim.decodeNormValue((byte) i);
+        assertFalse("negative len: " + len + ", byte=" + i + ", sim=" + sim, len < 0.0f);
+        assertFalse("inf len: " + len + ", byte=" + i + ", sim=" + sim, Float.isInfinite(len));
+        assertFalse("nan len for byte=" + i + ", sim=" + sim, Float.isNaN(len));
+        if (i > 0) {
+          assertTrue("len is not decreasing: " + len + ",byte=" + i + ",sim=" + sim, len < sim.decodeNormValue((byte)(i-1)));
+        }
+      }
+    }
+  }
+  
+  /**
+   * make sure the similarity does not go crazy when tested against all possible norm values.
+   */
+  public void testCrazyIndexTimeBoosts() throws Exception {
+    long avgLength = 750;
+    long docCount = 500000;
+    long numTokens = docCount * avgLength;
+   
+    CollectionStatistics collectionStats = new CollectionStatistics("body", docCount, docCount, numTokens, numTokens);
+    
+    long docFreq = 2000;
+    long totalTermFreq = 2000 * avgLength;
+    
+    TermStatistics termStats = new TermStatistics(new BytesRef("term"), docFreq, totalTermFreq);
+    
+    for (SimilarityBase sim : sims) {
+      if (sim instanceof IBSimilarity) {
+        if (((IBSimilarity)sim).getDistribution() instanceof DistributionSPL) {
+          // score goes infinite for tiny doc lengths and negative for huge doc lengths
+          // TODO: fix this
+          continue;
+        }
+      } else if (sim instanceof DFRSimilarity) {
+        BasicModel model = ((DFRSimilarity)sim).getBasicModel();
+        if (model instanceof BasicModelD || model instanceof BasicModelP) {
+          // score goes NaN for tiny doc lengths
+          // TODO: fix this
+          continue;
+        } else if (model instanceof BasicModelBE) {
+          // score goes negative infinity for tiny doc lengths
+          // TODO: fix this
+          continue;
+        }
+      }
+      BasicStats stats = (BasicStats) sim.computeWeight(collectionStats, termStats);
+      for (float tf = 1.0f; tf <= 10.0f; tf += 1.0f) {
+        for (int i = 0; i < 256; i++) {
+          float len = sim.decodeNormValue((byte) i);
+          float score = sim.score(stats, tf, len);
+          assertFalse("negative score for " + sim + ", len=" + len + ",score=" + score, score < 0.0f);
+          assertFalse("inf score for " + sim + ", len=" + len, Float.isInfinite(score));
+          assertFalse("nan score for " + sim + ", len=" + len, Float.isNaN(score));
+        }
+      }
+    }
+  }
 }