From 9dc0ba4c7bf5ba2540083afb043d49450eea851f Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 18 Jan 2016 07:48:24 +0000 Subject: [PATCH] LUCENE-6896: don't treat smallest possible norm value as an infinitely long doc in SimilarityBase or BM25Similarity git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1725178 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 4 ++ .../search/similarities/BM25Similarity.java | 3 +- .../search/similarities/BasicModelBE.java | 8 +-- .../search/similarities/BasicModelD.java | 2 +- .../search/similarities/DistributionSPL.java | 2 + .../search/similarities/SimilarityBase.java | 3 +- .../similarities/TestBM25Similarity.java | 36 +++++++++++ .../similarities/TestClassicSimilarity.java | 13 ++++ .../search/similarities/TestSimilarity2.java | 3 +- .../similarities/TestSimilarityBase.java | 61 +++++++++++++++++++ 10 files changed, 127 insertions(+), 8 deletions(-) create mode 100644 lucene/core/src/test/org/apache/lucene/search/similarities/TestBM25Similarity.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 7d01a5964a8..dd3b08ac00b 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -184,6 +184,10 @@ Bug Fixes EOFException if you seek past the end of the file and then try to read (Stéphane Campinas via Mike McCandless) +* LUCENE-6896: Don't treat the smallest possible norm value as an infinitely + long document in SimilarityBase or BM25Similarity. Add more warnings to sims + that will not work well with extreme tf values. (Ahmet Arslan, Robert Muir) + Other * LUCENE-6924: Upgrade randomizedtesting to 2.3.2. (Dawid Weiss) diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java index 222c5a8c02d..da00fa3bb6a 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java @@ -128,10 +128,11 @@ public class BM25Similarity extends Similarity { private static final float[] NORM_TABLE = new float[256]; static { - for (int i = 0; i < 256; i++) { + for (int i = 1; i < 256; i++) { float f = SmallFloat.byte315ToFloat((byte)i); NORM_TABLE[i] = 1.0f / (f*f); } + NORM_TABLE[0] = 1.0f / NORM_TABLE[255]; // otherwise inf } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelBE.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelBE.java index c12f8d76563..f9d2fd11b94 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelBE.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelBE.java @@ -24,10 +24,10 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2; * slightly from the one in the original paper: {@code F} is increased by {@code tfn+1} * and {@code N} is increased by {@code F} * @lucene.experimental - * NOTE: in some corner cases this model may give poor performance with Normalizations that - * return large values for {@code tfn} such as NormalizationH3. Consider using the - * geometric approximation ({@link BasicModelG}) instead, which provides the same relevance - * but with less practical problems. + * NOTE: in some corner cases this model may give poor performance or infinite scores with + * Normalizations that return large or small values for {@code tfn} such as NormalizationH3. + * Consider using the geometric approximation ({@link BasicModelG}) instead, which provides + * the same relevance but with less practical problems. */ public class BasicModelBE extends BasicModel { diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelD.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelD.java index f5b20138caa..67295c5667c 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelD.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicModelD.java @@ -28,7 +28,7 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2; *

* WARNING: for terms that do not meet the expected random distribution * (e.g. stopwords), this model may give poor performance, such as - * abnormally high scores for low tf values. + * abnormally high or NaN scores for low tf values. * @lucene.experimental */ public class BasicModelD extends BasicModel { diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/DistributionSPL.java b/lucene/core/src/java/org/apache/lucene/search/similarities/DistributionSPL.java index 1b134696e99..661abd41022 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/DistributionSPL.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/DistributionSPL.java @@ -23,6 +23,8 @@ package org.apache.lucene.search.similarities; *

Unlike for DFR, the natural logarithm is used, as * it is faster to compute and the original paper does not express any * preference to a specific base.

+ * WARNING: this model currently returns infinite scores for very small + * tf values and negative scores for very large tf values * @lucene.experimental */ public class DistributionSPL extends Distribution { diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java b/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java index 370d5072971..f69ac17a784 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java @@ -220,10 +220,11 @@ public abstract class SimilarityBase extends Similarity { private static final float[] NORM_TABLE = new float[256]; static { - for (int i = 0; i < 256; i++) { + for (int i = 1; i < 256; i++) { float floatNorm = SmallFloat.byte315ToFloat((byte)i); NORM_TABLE[i] = 1.0f / (floatNorm * floatNorm); } + NORM_TABLE[0] = 1.0f / NORM_TABLE[255]; // otherwise inf } /** Encodes the document length in the same way as {@link TFIDFSimilarity}. */ diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBM25Similarity.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBM25Similarity.java new file mode 100644 index 00000000000..d6b6ff7ac33 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBM25Similarity.java @@ -0,0 +1,36 @@ +package org.apache.lucene.search.similarities; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.LuceneTestCase; + +public class TestBM25Similarity extends LuceneTestCase { + + public void testSaneNormValues() { + BM25Similarity sim = new BM25Similarity(); + for (int i = 0; i < 256; i++) { + float len = sim.decodeNormValue((byte) i); + assertFalse("negative len: " + len + ", byte=" + i, len < 0.0f); + assertFalse("inf len: " + len + ", byte=" + i, Float.isInfinite(len)); + assertFalse("nan len for byte=" + i, Float.isNaN(len)); + if (i > 0) { + assertTrue("len is not decreasing: " + len + ",byte=" + i, len < sim.decodeNormValue((byte)(i-1))); + } + } + } +} diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestClassicSimilarity.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestClassicSimilarity.java index a8ef2ee127e..b92b020b4fe 100644 --- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestClassicSimilarity.java +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestClassicSimilarity.java @@ -158,4 +158,17 @@ public class TestClassicSimilarity extends LuceneTestCase { assertEquals(1, topDocs.scoreDocs.length); assertTrue(topDocs.scoreDocs[0].score != 0); } + + public void testSaneNormValues() { + ClassicSimilarity sim = new ClassicSimilarity(); + for (int i = 0; i < 256; i++) { + float boost = sim.decodeNormValue((byte) i); + assertFalse("negative boost: " + boost + ", byte=" + i, boost < 0.0f); + assertFalse("inf bost: " + boost + ", byte=" + i, Float.isInfinite(boost)); + assertFalse("nan boost for byte=" + i, Float.isNaN(boost)); + if (i > 0) { + assertTrue("boost is not increasing: " + boost + ",byte=" + i, boost > sim.decodeNormValue((byte)(i-1))); + } + } + } } diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java index db93cd26334..863f6158495 100644 --- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java @@ -286,8 +286,9 @@ public class TestSimilarity2 extends LuceneTestCase { TopDocs td = is.search(query, 10); assertEquals(1, td.totalHits); float score = td.scoreDocs[0].score; - assertTrue(score >= 0.0f); + assertFalse("negative score for " + sim, score < 0.0f); assertFalse("inf score for " + sim, Float.isInfinite(score)); + assertFalse("nan score for " + sim, Float.isNaN(score)); } ir.close(); dir.close(); diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java index 3a4f431f28b..3bf6ed831fb 100644 --- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java @@ -592,4 +592,65 @@ public class TestSimilarityBase extends LuceneTestCase { actual.setDiscountOverlaps(true); assertEquals(expected.computeNorm(state), actual.computeNorm(state)); } + + public void testSaneNormValues() { + for (SimilarityBase sim : sims) { + for (int i = 0; i < 256; i++) { + float len = sim.decodeNormValue((byte) i); + assertFalse("negative len: " + len + ", byte=" + i + ", sim=" + sim, len < 0.0f); + assertFalse("inf len: " + len + ", byte=" + i + ", sim=" + sim, Float.isInfinite(len)); + assertFalse("nan len for byte=" + i + ", sim=" + sim, Float.isNaN(len)); + if (i > 0) { + assertTrue("len is not decreasing: " + len + ",byte=" + i + ",sim=" + sim, len < sim.decodeNormValue((byte)(i-1))); + } + } + } + } + + /** + * make sure the similarity does not go crazy when tested against all possible norm values. + */ + public void testCrazyIndexTimeBoosts() throws Exception { + long avgLength = 750; + long docCount = 500000; + long numTokens = docCount * avgLength; + + CollectionStatistics collectionStats = new CollectionStatistics("body", docCount, docCount, numTokens, numTokens); + + long docFreq = 2000; + long totalTermFreq = 2000 * avgLength; + + TermStatistics termStats = new TermStatistics(new BytesRef("term"), docFreq, totalTermFreq); + + for (SimilarityBase sim : sims) { + if (sim instanceof IBSimilarity) { + if (((IBSimilarity)sim).getDistribution() instanceof DistributionSPL) { + // score goes infinite for tiny doc lengths and negative for huge doc lengths + // TODO: fix this + continue; + } + } else if (sim instanceof DFRSimilarity) { + BasicModel model = ((DFRSimilarity)sim).getBasicModel(); + if (model instanceof BasicModelD || model instanceof BasicModelP) { + // score goes NaN for tiny doc lengths + // TODO: fix this + continue; + } else if (model instanceof BasicModelBE) { + // score goes negative infinity for tiny doc lengths + // TODO: fix this + continue; + } + } + BasicStats stats = (BasicStats) sim.computeWeight(collectionStats, termStats); + for (float tf = 1.0f; tf <= 10.0f; tf += 1.0f) { + for (int i = 0; i < 256; i++) { + float len = sim.decodeNormValue((byte) i); + float score = sim.score(stats, tf, len); + assertFalse("negative score for " + sim + ", len=" + len + ",score=" + score, score < 0.0f); + assertFalse("inf score for " + sim + ", len=" + len, Float.isInfinite(score)); + assertFalse("nan score for " + sim + ", len=" + len, Float.isNaN(score)); + } + } + } + } }