LUCENE-6896: don't treat smallest possible norm value as an infinitely long doc in SimilarityBase or BM25Similarity

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1725178 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2016-01-18 07:48:24 +00:00
parent 40d290ee84
commit 9dc0ba4c7b
10 changed files with 127 additions and 8 deletions

View File

@ -184,6 +184,10 @@ Bug Fixes
EOFException if you seek past the end of the file and then try to
read (Stéphane Campinas via Mike McCandless)
* LUCENE-6896: Don't treat the smallest possible norm value as an infinitely
long document in SimilarityBase or BM25Similarity. Add more warnings to sims
that will not work well with extreme tf values. (Ahmet Arslan, Robert Muir)
Other
* LUCENE-6924: Upgrade randomizedtesting to 2.3.2. (Dawid Weiss)

View File

@ -128,10 +128,11 @@ public class BM25Similarity extends Similarity {
private static final float[] NORM_TABLE = new float[256];
static {
for (int i = 0; i < 256; i++) {
for (int i = 1; i < 256; i++) {
float f = SmallFloat.byte315ToFloat((byte)i);
NORM_TABLE[i] = 1.0f / (f*f);
}
NORM_TABLE[0] = 1.0f / NORM_TABLE[255]; // otherwise inf
}

View File

@ -24,10 +24,10 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2;
* slightly from the one in the original paper: {@code F} is increased by {@code tfn+1}
* and {@code N} is increased by {@code F}
* @lucene.experimental
* NOTE: in some corner cases this model may give poor performance with Normalizations that
* return large values for {@code tfn} such as NormalizationH3. Consider using the
* geometric approximation ({@link BasicModelG}) instead, which provides the same relevance
* but with less practical problems.
* NOTE: in some corner cases this model may give poor performance or infinite scores with
* Normalizations that return large or small values for {@code tfn} such as NormalizationH3.
* Consider using the geometric approximation ({@link BasicModelG}) instead, which provides
* the same relevance but with fewer practical problems.
*/
public class BasicModelBE extends BasicModel {

View File

@ -28,7 +28,7 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2;
* <p>
* WARNING: for terms that do not meet the expected random distribution
* (e.g. stopwords), this model may give poor performance, such as
* abnormally high scores for low tf values.
* abnormally high or NaN scores for low tf values.
* @lucene.experimental
*/
public class BasicModelD extends BasicModel {

View File

@ -23,6 +23,8 @@ package org.apache.lucene.search.similarities;
* <p>Unlike for DFR, the natural logarithm is used, as
* it is faster to compute and the original paper does not express any
* preference to a specific base.</p>
* WARNING: this model currently returns infinite scores for very small
* tf values and negative scores for very large tf values.
* @lucene.experimental
*/
public class DistributionSPL extends Distribution {

View File

@ -220,10 +220,11 @@ public abstract class SimilarityBase extends Similarity {
private static final float[] NORM_TABLE = new float[256];
static {
for (int i = 0; i < 256; i++) {
for (int i = 1; i < 256; i++) {
float floatNorm = SmallFloat.byte315ToFloat((byte)i);
NORM_TABLE[i] = 1.0f / (floatNorm * floatNorm);
}
NORM_TABLE[0] = 1.0f / NORM_TABLE[255]; // otherwise inf
}
/** Encodes the document length in the same way as {@link TFIDFSimilarity}. */

View File

@ -0,0 +1,36 @@
package org.apache.lucene.search.similarities;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.LuceneTestCase;
/** Sanity checks for the norm values decoded by {@link BM25Similarity}. */
public class TestBM25Similarity extends LuceneTestCase {

  /**
   * Decodes each of the 256 possible norm bytes and verifies that the resulting
   * length is not negative, not infinite and not NaN, and that it is strictly
   * smaller than the length decoded for the preceding byte value.
   */
  public void testSaneNormValues() {
    final BM25Similarity similarity = new BM25Similarity();
    for (int b = 0; b <= 255; b++) {
      final float decoded = similarity.decodeNormValue((byte) b);
      assertFalse("negative len: " + decoded + ", byte=" + b, decoded < 0.0f);
      assertFalse("inf len: " + decoded + ", byte=" + b, Float.isInfinite(decoded));
      assertFalse("nan len for byte=" + b, Float.isNaN(decoded));
      if (b == 0) {
        // the first byte has no predecessor to compare against
        continue;
      }
      final float previous = similarity.decodeNormValue((byte) (b - 1));
      assertTrue("len is not decreasing: " + decoded + ",byte=" + b, decoded < previous);
    }
  }
}

View File

@ -158,4 +158,17 @@ public class TestClassicSimilarity extends LuceneTestCase {
assertEquals(1, topDocs.scoreDocs.length);
assertTrue(topDocs.scoreDocs[0].score != 0);
}
/**
 * Decodes each of the 256 possible norm bytes with {@link ClassicSimilarity} and
 * verifies that the resulting boost is not negative, not infinite and not NaN,
 * and that it is strictly greater than the boost decoded for the preceding byte.
 */
public void testSaneNormValues() {
  ClassicSimilarity sim = new ClassicSimilarity();
  for (int i = 0; i < 256; i++) {
    float boost = sim.decodeNormValue((byte) i);
    assertFalse("negative boost: " + boost + ", byte=" + i, boost < 0.0f);
    assertFalse("inf boost: " + boost + ", byte=" + i, Float.isInfinite(boost));
    assertFalse("nan boost for byte=" + i, Float.isNaN(boost));
    if (i > 0) {
      // byte 0 has no predecessor, so the ordering check starts at byte 1
      assertTrue("boost is not increasing: " + boost + ",byte=" + i, boost > sim.decodeNormValue((byte)(i-1)));
    }
  }
}
}

View File

@ -286,8 +286,9 @@ public class TestSimilarity2 extends LuceneTestCase {
TopDocs td = is.search(query, 10);
assertEquals(1, td.totalHits);
float score = td.scoreDocs[0].score;
assertTrue(score >= 0.0f);
assertFalse("negative score for " + sim, score < 0.0f);
assertFalse("inf score for " + sim, Float.isInfinite(score));
assertFalse("nan score for " + sim, Float.isNaN(score));
}
ir.close();
dir.close();

View File

@ -592,4 +592,65 @@ public class TestSimilarityBase extends LuceneTestCase {
actual.setDiscountOverlaps(true);
assertEquals(expected.computeNorm(state), actual.computeNorm(state));
}
/**
 * For every similarity in {@code sims}, decodes each of the 256 possible norm
 * bytes and verifies that the resulting length is not negative, not infinite
 * and not NaN, and that it is strictly smaller than the length decoded for the
 * preceding byte value.
 */
public void testSaneNormValues() {
  for (SimilarityBase similarity : sims) {
    for (int b = 0; b <= 255; b++) {
      final float decoded = similarity.decodeNormValue((byte) b);
      assertFalse("negative len: " + decoded + ", byte=" + b + ", sim=" + similarity, decoded < 0.0f);
      assertFalse("inf len: " + decoded + ", byte=" + b + ", sim=" + similarity, Float.isInfinite(decoded));
      assertFalse("nan len for byte=" + b + ", sim=" + similarity, Float.isNaN(decoded));
      if (b == 0) {
        // the first byte has no predecessor to compare against
        continue;
      }
      final float previous = similarity.decodeNormValue((byte) (b - 1));
      assertTrue("len is not decreasing: " + decoded + ",byte=" + b + ",sim=" + similarity, decoded < previous);
    }
  }
}
/**
 * Scores every similarity in {@code sims} against all 256 decodable norm values
 * for term frequencies 1 through 10, and checks that no score is negative,
 * infinite or NaN. Similarities with known problems are skipped (see TODOs).
 */
public void testCrazyIndexTimeBoosts() throws Exception {
  final long avgLength = 750;
  final long docCount = 500000;
  final long numTokens = docCount * avgLength;
  final CollectionStatistics collectionStats =
      new CollectionStatistics("body", docCount, docCount, numTokens, numTokens);

  final long docFreq = 2000;
  final long totalTermFreq = 2000 * avgLength;
  final TermStatistics termStats =
      new TermStatistics(new BytesRef("term"), docFreq, totalTermFreq);

  for (SimilarityBase sim : sims) {
    boolean knownBroken = false;
    if (sim instanceof IBSimilarity) {
      // DistributionSPL: score goes infinite for tiny doc lengths and negative for huge doc lengths
      // TODO: fix this
      knownBroken = ((IBSimilarity) sim).getDistribution() instanceof DistributionSPL;
    } else if (sim instanceof DFRSimilarity) {
      final BasicModel basicModel = ((DFRSimilarity) sim).getBasicModel();
      // BasicModelD / BasicModelP: score goes NaN for tiny doc lengths
      // BasicModelBE: score goes negative infinity for tiny doc lengths
      // TODO: fix this
      knownBroken = basicModel instanceof BasicModelD
          || basicModel instanceof BasicModelP
          || basicModel instanceof BasicModelBE;
    }
    if (knownBroken) {
      continue;
    }

    final BasicStats stats = (BasicStats) sim.computeWeight(collectionStats, termStats);
    for (int freq = 1; freq <= 10; freq++) {
      final float tf = freq;
      for (int b = 0; b <= 255; b++) {
        final float len = sim.decodeNormValue((byte) b);
        final float score = sim.score(stats, tf, len);
        assertFalse("negative score for " + sim + ", len=" + len + ",score=" + score, score < 0.0f);
        assertFalse("inf score for " + sim + ", len=" + len, Float.isInfinite(score));
        assertFalse("nan score for " + sim + ", len=" + len, Float.isNaN(score));
      }
    }
  }
}
}